aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/benchmarks/CMakeLists.txt1
-rw-r--r--llvm/benchmarks/SpecialCaseListBM.cpp207
-rw-r--r--llvm/docs/AArch64SME.rst24
-rw-r--r--llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst62
-rw-r--r--llvm/docs/AMDGPUUsage.rst24
-rw-r--r--llvm/docs/CMakeLists.txt22
-rw-r--r--llvm/docs/CallGraphSection.md6
-rw-r--r--llvm/docs/CodeOfConduct.rst1
-rw-r--r--llvm/docs/DirectX/DXILResources.rst89
-rw-r--r--llvm/docs/GettingStartedVS.rst13
-rw-r--r--llvm/docs/HowToBuildOnARM.rst18
-rw-r--r--llvm/docs/HowToReleaseLLVM.rst82
-rw-r--r--llvm/docs/LangRef.rst55
-rw-r--r--llvm/docs/ReleaseNotes.md14
-rw-r--r--llvm/docs/SPIRVUsage.rst2
-rw-r--r--llvm/docs/TableGen/BackEnds.rst50
-rw-r--r--llvm/include/llvm-c/Core.h2
-rw-r--r--llvm/include/llvm-c/DebugInfo.h24
-rw-r--r--llvm/include/llvm/ADT/APFloat.h2
-rw-r--r--llvm/include/llvm/ADT/Bitfields.h88
-rw-r--r--llvm/include/llvm/ADT/StringExtras.h6
-rw-r--r--llvm/include/llvm/ADT/StringSwitch.h77
-rw-r--r--llvm/include/llvm/Analysis/DXILResource.h19
-rw-r--r--llvm/include/llvm/Analysis/IR2Vec.h2
-rw-r--r--llvm/include/llvm/Analysis/LoopInfo.h2
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h4
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h92
-rw-r--r--llvm/include/llvm/Analysis/StaticDataProfileInfo.h62
-rw-r--r--llvm/include/llvm/Analysis/TargetLibraryInfo.h6
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h2
-rw-r--r--llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def1
-rw-r--r--llvm/include/llvm/CAS/CASID.h3
-rw-r--r--llvm/include/llvm/CodeGen/AsmPrinter.h8
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h3
-rw-r--r--llvm/include/llvm/CodeGen/ISDOpcodes.h6
-rw-r--r--llvm/include/llvm/CodeGen/LiveIntervals.h4
-rw-r--r--llvm/include/llvm/CodeGen/LiveRangeCalc.h2
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAG.h6
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAGNodes.h2
-rw-r--r--llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h11
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h5
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h4
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h12
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/AllocationActions.h13
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h3
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h26
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h87
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h51
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/ClauseT.h62
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h6
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMP.td11
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPKinds.def2
-rw-r--r--llvm/include/llvm/IR/CFG.h7
-rw-r--r--llvm/include/llvm/IR/ConstantFPRange.h13
-rw-r--r--llvm/include/llvm/IR/DebugProgramInstruction.h10
-rw-r--r--llvm/include/llvm/IR/IRBuilder.h5
-rw-r--r--llvm/include/llvm/IR/IntrinsicsDirectX.td3
-rw-r--r--llvm/include/llvm/IR/IntrinsicsRISCVXsf.td94
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSPIRV.td3
-rw-r--r--llvm/include/llvm/IR/RuntimeLibcalls.td116
-rw-r--r--llvm/include/llvm/IR/Value.h4
-rw-r--r--llvm/include/llvm/LTO/LTO.h6
-rw-r--r--llvm/include/llvm/Object/ELFTypes.h19
-rw-r--r--llvm/include/llvm/ObjectYAML/ELFYAML.h1
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfCorrelator.h2
-rw-r--r--llvm/include/llvm/Support/BinaryStreamWriter.h4
-rw-r--r--llvm/include/llvm/Support/Caching.h3
-rw-r--r--llvm/include/llvm/Support/DebugCounter.h3
-rw-r--r--llvm/include/llvm/Support/DebugLog.h12
-rw-r--r--llvm/include/llvm/Support/Format.h15
-rw-r--r--llvm/include/llvm/Support/ScopedPrinter.h6
-rw-r--r--llvm/include/llvm/Support/SourceMgr.h23
-rw-r--r--llvm/include/llvm/Support/SpecialCaseList.h16
-rw-r--r--llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h1
-rw-r--r--llvm/include/llvm/TableGen/CodeGenHelpers.h48
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td1
-rw-r--r--llvm/include/llvm/TargetParser/RISCVTargetParser.h2
-rw-r--r--llvm/include/llvm/TargetParser/X86TargetParser.def2
-rw-r--r--llvm/include/llvm/TargetParser/X86TargetParser.h2
-rw-r--r--llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h8
-rw-r--r--llvm/include/llvm/Transforms/Coroutines/SpillUtils.h9
-rw-r--r--llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h10
-rw-r--r--llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h5
-rw-r--r--llvm/include/llvm/XRay/BlockIndexer.h6
-rw-r--r--llvm/include/llvm/XRay/BlockPrinter.h6
-rw-r--r--llvm/include/llvm/XRay/BlockVerifier.h6
-rw-r--r--llvm/include/llvm/XRay/FDRLogBuilder.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecordConsumer.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecordProducer.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecords.h6
-rw-r--r--llvm/include/llvm/XRay/FDRTraceExpander.h6
-rw-r--r--llvm/include/llvm/XRay/FDRTraceWriter.h6
-rw-r--r--llvm/include/llvm/XRay/FileHeaderReader.h6
-rw-r--r--llvm/include/llvm/XRay/Graph.h7
-rw-r--r--llvm/include/llvm/XRay/InstrumentationMap.h19
-rw-r--r--llvm/include/llvm/XRay/Profile.h6
-rw-r--r--llvm/include/llvm/XRay/RecordPrinter.h6
-rw-r--r--llvm/include/llvm/XRay/Trace.h6
-rw-r--r--llvm/include/llvm/XRay/XRayRecord.h6
-rw-r--r--llvm/include/llvm/XRay/YAMLXRayRecord.h18
-rwxr-xr-xllvm/lib/Analysis/ConstantFolding.cpp56
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp47
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp57
-rw-r--r--llvm/lib/Analysis/IVDescriptors.cpp13
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp2
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp4
-rw-r--r--llvm/lib/Analysis/MLInlineAdvisor.cpp18
-rw-r--r--llvm/lib/Analysis/MemorySSA.cpp2
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp294
-rw-r--r--llvm/lib/Analysis/StaticDataProfileInfo.cpp157
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp39
-rw-r--r--llvm/lib/BinaryFormat/XCOFF.cpp30
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.cpp6
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp1
-rw-r--r--llvm/lib/CAS/OnDiskTrieRawHashMap.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AIXException.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp7
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp46
-rw-r--r--llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp4
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp2
-rw-r--r--llvm/lib/CodeGen/BasicBlockPathCloning.cpp4
-rw-r--r--llvm/lib/CodeGen/BranchRelaxation.cpp14
-rw-r--r--llvm/lib/CodeGen/BreakFalseDeps.cpp4
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp2
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp4
-rw-r--r--llvm/lib/CodeGen/EdgeBundles.cpp11
-rw-r--r--llvm/lib/CodeGen/ExpandFp.cpp134
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp77
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalMergeFunctions.cpp10
-rw-r--r--llvm/lib/CodeGen/LiveIntervals.cpp11
-rw-r--r--llvm/lib/CodeGen/MIR2Vec.cpp13
-rw-r--r--llvm/lib/CodeGen/MIRFSDiscriminator.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRNamerPass.cpp17
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp36
-rw-r--r--llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp9
-rw-r--r--llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp18
-rw-r--r--llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp9
-rw-r--r--llvm/lib/CodeGen/MachineCopyPropagation.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp62
-rw-r--r--llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp8
-rw-r--r--llvm/lib/CodeGen/MachineOutliner.cpp5
-rw-r--r--llvm/lib/CodeGen/MachinePipeliner.cpp13
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp104
-rw-r--r--llvm/lib/CodeGen/MachineTraceMetrics.cpp7
-rw-r--r--llvm/lib/CodeGen/NonRelocatableStringpool.cpp4
-rw-r--r--llvm/lib/CodeGen/SafeStack.cpp4
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGInstrs.cpp8
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGPrinter.cpp80
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp102
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp64
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp1
-rw-r--r--llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp5
-rw-r--r--llvm/lib/CodeGen/StaticDataAnnotator.cpp15
-rw-r--r--llvm/lib/CodeGen/StaticDataSplitter.cpp6
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp3
-rw-r--r--llvm/lib/CodeGen/TargetRegisterInfo.cpp17
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp4
-rw-r--r--llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp12
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp53
-rw-r--r--llvm/lib/ExecutionEngine/Orc/CMakeLists.txt1
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp22
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp18
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp32
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp22
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp6
-rw-r--r--llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp104
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp28
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp367
-rw-r--r--llvm/lib/IR/AsmWriter.cpp11
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp59
-rw-r--r--llvm/lib/IR/ConstantFPRange.cpp166
-rw-r--r--llvm/lib/IR/ConstantFold.cpp3
-rw-r--r--llvm/lib/IR/Constants.cpp7
-rw-r--r--llvm/lib/IR/Core.cpp11
-rw-r--r--llvm/lib/IR/DebugInfo.cpp43
-rw-r--r--llvm/lib/IR/IRBuilder.cpp13
-rw-r--r--llvm/lib/IR/Instructions.cpp16
-rw-r--r--llvm/lib/IR/Type.cpp4
-rw-r--r--llvm/lib/IR/Verifier.cpp7
-rw-r--r--llvm/lib/LTO/LTO.cpp34
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp1
-rw-r--r--llvm/lib/MC/MCObjectFileInfo.cpp7
-rw-r--r--llvm/lib/ObjCopy/ConfigManager.cpp16
-rw-r--r--llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp42
-rw-r--r--llvm/lib/Object/ELF.cpp10
-rw-r--r--llvm/lib/ObjectYAML/ELFEmitter.cpp8
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp19
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp22
-rw-r--r--llvm/lib/Passes/PassRegistry.def1
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp2
-rw-r--r--llvm/lib/Remarks/BitstreamRemarkParser.h4
-rw-r--r--llvm/lib/Support/DebugCounter.cpp56
-rw-r--r--llvm/lib/Support/PrettyStackTrace.cpp2
-rw-r--r--llvm/lib/Support/SourceMgr.cpp26
-rw-r--r--llvm/lib/Support/SpecialCaseList.cpp53
-rw-r--r--llvm/lib/Support/TextEncoding.cpp6
-rw-r--r--llvm/lib/Support/UnicodeNameToCodepoint.cpp2
-rw-r--r--llvm/lib/Support/VirtualOutputBackends.cpp2
-rw-r--r--llvm/lib/Support/Windows/Signals.inc4
-rw-r--r--llvm/lib/TableGen/Main.cpp2
-rw-r--r--llvm/lib/TableGen/Parser.cpp2
-rw-r--r--llvm/lib/TableGen/Record.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp51
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrGISel.td7
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp8
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp70
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h1
-rw-r--r--llvm/lib/Target/AArch64/MachineSMEABIPass.cpp108
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp130
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h8
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp132
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h31
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp211
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h19
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td28
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp93
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td17
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp49
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h3
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp29
-rw-r--r--llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXContainerGlobals.cpp8
-rw-r--r--llvm/lib/Target/DirectX/DXIL.td8
-rw-r--r--llvm/lib/Target/DirectX/DXILOpBuilder.cpp8
-rw-r--r--llvm/lib/Target/DirectX/DXILOpLowering.cpp25
-rw-r--r--llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp15
-rw-r--r--llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp25
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp32
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsFastISel.cpp7
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp8
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp14
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td16
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td34
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp34
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.h34
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp140
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h3
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrP10.td189
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp12
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp12
-rw-r--r--llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h66
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp203
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp205
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td25
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td47
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td9
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td194
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td51
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrPredicates.td36
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td20
-rw-r--r--llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp24
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp19
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp32
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td4
-rw-r--r--llvm/lib/Target/Sparc/SparcFrameLowering.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZ.h1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp420
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h12
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp66
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td100
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp11
-rw-r--r--llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp39
-rw-r--r--llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp6
-rw-r--r--llvm/lib/Target/X86/X86.td13
-rw-r--r--llvm/lib/Target/X86/X86FloatingPoint.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp85
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp20
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td13
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp22
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp51
-rw-r--r--llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp16
-rw-r--r--llvm/lib/TargetParser/ARMTargetParserCommon.cpp10
-rw-r--r--llvm/lib/TargetParser/Host.cpp25
-rw-r--r--llvm/lib/TargetParser/RISCVISAInfo.cpp2
-rw-r--r--llvm/lib/TargetParser/RISCVTargetParser.cpp4
-rw-r--r--llvm/lib/TargetParser/TargetDataLayout.cpp9
-rw-r--r--llvm/lib/TargetParser/Triple.cpp166
-rw-r--r--llvm/lib/TargetParser/Unix/Host.inc22
-rw-r--r--llvm/lib/TargetParser/X86TargetParser.cpp7
-rw-r--r--llvm/lib/Transforms/CFGuard/CFGuard.cpp25
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp22
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroCloner.h9
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroEarly.cpp2
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInternal.h9
-rw-r--r--llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp5
-rw-r--r--llvm/lib/Transforms/Coroutines/SpillUtils.cpp37
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp39
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp6
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp12
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp5
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp108
-rw-r--r--llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp24
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp55
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp5
-rw-r--r--llvm/lib/Transforms/ObjCARC/PtrState.h3
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp36
-rw-r--r--llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/GVNSink.cpp48
-rw-r--r--llvm/lib/Transforms/Scalar/GuardWidening.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp54
-rw-r--r--llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFuse.cpp34
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPassManager.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp102
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/Reassociate.cpp42
-rw-r--r--llvm/lib/Transforms/Scalar/Reg2Mem.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp62
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp25
-rw-r--r--llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/CloneFunction.cpp67
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp105
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp30
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp25
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp14
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h41
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp27
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanHelpers.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h44
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp28
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp147
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp9
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUtils.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp23
-rw-r--r--llvm/lib/XRay/BlockIndexer.cpp7
-rw-r--r--llvm/lib/XRay/BlockPrinter.cpp7
-rw-r--r--llvm/lib/XRay/BlockVerifier.cpp18
-rw-r--r--llvm/lib/XRay/FDRRecordProducer.cpp14
-rw-r--r--llvm/lib/XRay/FDRRecords.cpp7
-rw-r--r--llvm/lib/XRay/FDRTraceExpander.cpp7
-rw-r--r--llvm/lib/XRay/FDRTraceWriter.cpp12
-rw-r--r--llvm/lib/XRay/FileHeaderReader.cpp12
-rw-r--r--llvm/lib/XRay/LogBuilderConsumer.cpp7
-rw-r--r--llvm/lib/XRay/Profile.cpp18
-rw-r--r--llvm/lib/XRay/RecordInitializer.cpp7
-rw-r--r--llvm/lib/XRay/RecordPrinter.cpp7
-rw-r--r--llvm/lib/XRay/Trace.cpp18
-rw-r--r--llvm/test/Analysis/BasicAA/intrinsics.ll15
-rw-r--r--llvm/test/Analysis/BasicAA/scalable-dse-aa.ll149
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cast.ll1924
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll6
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-cast.ll3876
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-ext.ll80
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll60
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll288
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll84
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll25
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll744
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll128
-rw-r--r--llvm/test/Analysis/DXILResource/buffer-frombinding.ll4
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/becount-couldnotcompute.ll22
-rw-r--r--llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll72
-rw-r--r--llvm/test/Analysis/ScalarEvolution/ptrtoaddr.ll135
-rw-r--r--llvm/test/Analysis/ScalarEvolution/ptrtoint.ll88
-rw-r--r--llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll28
-rw-r--r--llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll22
-rw-r--r--llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll39
-rw-r--r--llvm/test/Assembler/autoupgrade-wasm-intrinsics.ll5
-rw-r--r--llvm/test/Assembler/dicompileunit-invalid-language-version.ll25
-rw-r--r--llvm/test/Assembler/implicit-intrinsic-declaration-invalid3.ll2
-rw-r--r--llvm/test/Bindings/llvm-c/debug_info_new_format.ll107
-rw-r--r--llvm/test/Bitcode/Inputs/compile-unit-no-versioned-language.bcbin0 -> 1760 bytes
-rw-r--r--llvm/test/Bitcode/dwarf-source-language-version.ll17
-rw-r--r--llvm/test/Bitcode/upgrade-DICompileUnit-no-versioned-language.test21
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir278
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir276
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir6
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-matmul.ll87
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-post-coalescer.mir16
-rw-r--r--llvm/test/CodeGen/AArch64/adds_cmn.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll)6
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll (renamed from llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll)2
-rw-r--r--llvm/test/CodeGen/AArch64/combine-sdiv.ll85
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-sve-win.mir30
-rw-r--r--llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir227
-rw-r--r--llvm/test/CodeGen/AArch64/mir-yaml-has-streaming-mode-changes.ll13
-rw-r--r--llvm/test/CodeGen/AArch64/sat-add.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sme-agnostic-za.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir12
-rw-r--r--llvm/test/CodeGen/AArch64/sme-za-exceptions.ll242
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fp-reduce.ll178
-rw-r--r--llvm/test/CodeGen/AArch64/sve-int-reduce.ll125
-rw-r--r--llvm/test/CodeGen/AArch64/win-sve.ll148
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll280
-rw-r--r--llvm/test/CodeGen/AMDGPU/abs_i16.ll980
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll31665
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll3231
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll6882
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll3176
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll888
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll362
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll11297
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll2484
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll4594
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll1795
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll4962
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll5336
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll5688
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll6014
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll6338
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll1806
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/bypass-div.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir87
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll275
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll335
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.bf16.ll123
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll1365
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll190
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll152
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-constant.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir1854
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll204
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/readsteadycounter.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll239
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched.group.classification.mir59
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir32
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv.ll788
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir9
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll20
-rw-r--r--llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir7
-rw-r--r--llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll2
-rw-r--r--llvm/test/CodeGen/ARM/call-graph-section-assembly.ll6
-rw-r--r--llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll6
-rw-r--r--llvm/test/CodeGen/ARM/call-graph-section.ll6
-rw-r--r--llvm/test/CodeGen/ARM/carry.ll87
-rw-r--r--llvm/test/CodeGen/ARM/nnan-fsub.ll20
-rw-r--r--llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll8
-rw-r--r--llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll12
-rw-r--r--llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll12
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll14
-rw-r--r--llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll20
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll (renamed from llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll)13
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll89
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/bufferGetDimensions.ll16
-rw-r--r--llvm/test/CodeGen/Hexagon/swp-many-stores.mir88
-rw-r--r--llvm/test/CodeGen/LoongArch/calling-conv-half.ll167
-rw-r--r--llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll5
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/vselect.ll125
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/vselect.ll137
-rw-r--r--llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir2
-rw-r--r--llvm/test/CodeGen/NVPTX/i32x2-instructions.ll167
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-commit.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-cp.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-fence.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-ld.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-shift.ll2
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-st.ll4
-rw-r--r--llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll60
-rw-r--r--llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll (renamed from llvm/test/CodeGen/PowerPC/check-zero-vector.ll)77
-rw-r--r--llvm/test/CodeGen/PowerPC/fmf-propagation.ll90
-rw-r--r--llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll307
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-all-ones.ll23
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-reduce-add.ll17
-rw-r--r--llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll63
-rw-r--r--llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll72
-rw-r--r--llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll104
-rw-r--r--llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll64
-rw-r--r--llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll92
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir4
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir1742
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir1731
-rw-r--r--llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/branch-rel.mir39
-rw-r--r--llvm/test/CodeGen/RISCV/div_minsize.ll148
-rw-r--r--llvm/test/CodeGen/RISCV/i64-icmp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/idiv_large.ll2311
-rw-r--r--llvm/test/CodeGen/RISCV/min-max.ll634
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll76
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir57
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll18
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir523
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll52
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll72
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll23
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll22
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll114
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll114
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll24
-rw-r--r--llvm/test/CodeGen/RISCV/select-to-and-zext.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/setcc-logic.ll5
-rw-r--r--llvm/test/CodeGen/RISCV/sext-zext-trunc.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/xaluo.ll12
-rw-r--r--llvm/test/CodeGen/SPIRV/FCmpFalse.ll10
-rw-r--r--llvm/test/CodeGen/SPIRV/FCmpFalse_Vec.ll13
-rw-r--r--llvm/test/CodeGen/SPIRV/builtin_duplicate.ll20
-rw-r--r--llvm/test/CodeGen/SPIRV/complex-constexpr.ll21
-rw-r--r--llvm/test/CodeGen/SPIRV/dominator-order.ll25
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll21
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll22
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll21
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll (renamed from llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll)0
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll1
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll (renamed from llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll)2
-rw-r--r--llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll19
-rw-r--r--llvm/test/CodeGen/SPIRV/llvm-intrinsics/fake_use.ll13
-rw-r--r--llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchange_cl20.ll84
-rw-r--r--llvm/test/CodeGen/SystemZ/htm-intrinsics.ll4
-rw-r--r--llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll738
-rw-r--r--llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll1665
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll473
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll233
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll135
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll57
-rw-r--r--llvm/test/CodeGen/Thumb2/carry.ll59
-rw-r--r--llvm/test/CodeGen/WebAssembly/bulk-memory.ll97
-rw-r--r--llvm/test/CodeGen/WebAssembly/bulk-memory64.ll91
-rw-r--r--llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll208
-rw-r--r--llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll48
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll106
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll104
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll1309
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll67
-rw-r--r--llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll109
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll12
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir20
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir15
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir20
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir8
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir8
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/pr49087.ll50
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir4
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir27
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll21
-rw-r--r--llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll12
-rw-r--r--llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll34
-rw-r--r--llvm/test/CodeGen/X86/apx/cf.ll18
-rw-r--r--llvm/test/CodeGen/X86/avg.ll74
-rw-r--r--llvm/test/CodeGen/X86/avx-shift.ll2
-rw-r--r--llvm/test/CodeGen/X86/avx2-arith.ll4
-rw-r--r--llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll2
-rw-r--r--llvm/test/CodeGen/X86/call-graph-section-assembly.ll6
-rw-r--r--llvm/test/CodeGen/X86/call-graph-section-tailcall.ll9
-rw-r--r--llvm/test/CodeGen/X86/call-graph-section.ll9
-rw-r--r--llvm/test/CodeGen/X86/combine-mul.ll22
-rw-r--r--llvm/test/CodeGen/X86/combine-multiplies.ll4
-rw-r--r--llvm/test/CodeGen/X86/combine-pmuldq.ll24
-rw-r--r--llvm/test/CodeGen/X86/combine-rotates.ll4
-rw-r--r--llvm/test/CodeGen/X86/combine-sdiv.ll8
-rw-r--r--llvm/test/CodeGen/X86/combine-shl.ll54
-rw-r--r--llvm/test/CodeGen/X86/combine-srem.ll10
-rw-r--r--llvm/test/CodeGen/X86/combine-udiv.ll28
-rw-r--r--llvm/test/CodeGen/X86/combine-urem.ll4
-rw-r--r--llvm/test/CodeGen/X86/cpus-intel.ll4
-rw-r--r--llvm/test/CodeGen/X86/dagcombine-shifts.ll4
-rw-r--r--llvm/test/CodeGen/X86/funnel-shift.ll8
-rw-r--r--llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll99
-rw-r--r--llvm/test/CodeGen/X86/global-variable-partition.ll18
-rw-r--r--llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll8
-rw-r--r--llvm/test/CodeGen/X86/known-pow2.ll6
-rw-r--r--llvm/test/CodeGen/X86/madd.ll10
-rw-r--r--llvm/test/CodeGen/X86/masked_gather_scatter.ll60
-rw-r--r--llvm/test/CodeGen/X86/min-legal-vector-width.ll57
-rw-r--r--llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll16
-rw-r--r--llvm/test/CodeGen/X86/pmul.ll2
-rw-r--r--llvm/test/CodeGen/X86/pr160612.ll74
-rw-r--r--llvm/test/CodeGen/X86/pr162812.ll50
-rw-r--r--llvm/test/CodeGen/X86/pr49087.ll30
-rw-r--r--llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll2
-rw-r--r--llvm/test/CodeGen/X86/relptr-rodata.ll15
-rw-r--r--llvm/test/CodeGen/X86/rotate-extract-vector.ll38
-rw-r--r--llvm/test/CodeGen/X86/sdiv-exact.ll18
-rw-r--r--llvm/test/CodeGen/X86/setcc-wide-types.ll172
-rw-r--r--llvm/test/CodeGen/X86/shrink_vmul.ll20
-rw-r--r--llvm/test/CodeGen/X86/slow-pmulld.ll8
-rw-r--r--llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll4
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll1
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll388
-rw-r--r--llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll24
-rw-r--r--llvm/test/CodeGen/X86/udiv-exact.ll18
-rw-r--r--llvm/test/CodeGen/X86/undo-mul-and.ll18
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll9
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll470
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll20
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll36
-rw-r--r--llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll20
-rw-r--r--llvm/test/CodeGen/X86/var-permute-128.ll63
-rw-r--r--llvm/test/CodeGen/X86/vec_reassociate.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-256.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshl-sub128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-256.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll36
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll32
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll32
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll56
-rw-r--r--llvm/test/CodeGen/X86/vector-mul.ll100
-rw-r--r--llvm/test/CodeGen/X86/vector-reduce-add-mask.ll2
-rw-r--r--llvm/test/CodeGen/X86/vector-rotate-128.ll16
-rw-r--r--llvm/test/CodeGen/X86/vector-rotate-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-ashr-128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-lshr-128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-shl-128.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-shift-shl-256.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc-math.ll92
-rw-r--r--llvm/test/CodeGen/X86/vselect-avx.ll2
-rw-r--r--llvm/test/CodeGen/X86/vselect-pcmp.ll6
-rw-r--r--llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll52
-rw-r--r--llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll216
-rw-r--r--llvm/test/DebugInfo/AArch64/callsite.mir68
-rw-r--r--llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll2
-rw-r--r--llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir99
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll177
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll63
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/fake-stack.ll88
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/lifetime.ll92
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll5
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll6
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll7
-rw-r--r--llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll21
-rw-r--r--llvm/test/MC/AArch64/data-directive-specifier.s3
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s8
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s12
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s75
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s8
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s15
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sop1.s103
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sop2.s103
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sopc.s10
-rw-r--r--llvm/test/MC/AMDGPU/lit.local.cfg2
-rw-r--r--llvm/test/MC/AMDGPU/offset-expr.s8
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt3
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt422
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt402
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/literals.txt30
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt8
-rw-r--r--llvm/test/MC/WebAssembly/simd-encodings.s8
-rw-r--r--llvm/test/MC/X86/apx/pushp-popp-att.s8
-rw-r--r--llvm/test/MC/X86/verify-callgraph-section.s4
-rw-r--r--llvm/test/Other/debugcounter-dce.ll10
-rw-r--r--llvm/test/TableGen/directive1.td1
-rw-r--r--llvm/test/TableGen/directive2.td1
-rw-r--r--llvm/test/TableGen/listsplat.td6
-rw-r--r--llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll18
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll2
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll4
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll60
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll2
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll2
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/single_succ_switch.ll2
-rw-r--r--llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll280
-rw-r--r--llvm/test/Transforms/GVN/PRE/pre-load.ll48
-rw-r--r--llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll16
-rw-r--r--llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll168
-rw-r--r--llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll738
-rw-r--r--llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll55
-rw-r--r--llvm/test/Transforms/InstCombine/add-sitofp.ll7
-rw-r--r--llvm/test/Transforms/InstCombine/binop-itofp.ll22
-rw-r--r--llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-trunc.ll30
-rw-r--r--llvm/test/Transforms/InstCombine/ptr-int-cast.ll11
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoaddr.ll5
-rw-r--r--llvm/test/Transforms/InstCombine/select-safe-impliedcond-transforms.ll15
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll1
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll17
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll51
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll17
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll1
-rw-r--r--llvm/test/Transforms/InstSimplify/ptrmask.ll20
-rw-r--r--llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll230
-rw-r--r--llvm/test/Transforms/LoopUnroll/scevunroll.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll13
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll19
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll30
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll172
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll7
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll5
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll40
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll32
-rw-r--r--llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/avx1.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll22
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/cost-model.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll22
-rw-r--r--llvm/test/Transforms/LoopVectorize/assume.ll20
-rw-r--r--llvm/test/Transforms/LoopVectorize/bsd_regex.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/if-conversion.ll10
-rw-r--r--llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/memdep.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll9
-rw-r--r--llvm/test/Transforms/LoopVectorize/operand-bundles.ll227
-rw-r--r--llvm/test/Transforms/LoopVectorize/partial-lcssa.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/pr28541.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/pr48832.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll588
-rw-r--r--llvm/test/Transforms/LoopVectorize/runtime-check.ll16
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalable-assume.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll103
-rw-r--r--llvm/test/Transforms/LoopVectorize/write-only.ll12
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll365
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll154
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll96
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll96
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll9
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll9
-rw-r--r--llvm/test/Transforms/PGOProfile/data-access-profile.ll83
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll27
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll89
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll9
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll47
-rw-r--r--llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll14
-rw-r--r--llvm/test/Transforms/SROA/phi-and-select.ll22
-rw-r--r--llvm/test/Transforms/SROA/phi-gep.ll29
-rw-r--r--llvm/test/Transforms/SROA/select-gep.ll19
-rw-r--r--llvm/test/Transforms/SROA/slice-width.ll19
-rw-r--r--llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll2
-rw-r--r--llvm/test/Verifier/llvm.used-invalid-init.ll2
-rw-r--r--llvm/test/Verifier/matrix-intrinsics.ll23
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected8
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected8
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s)0
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s (renamed from llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s)0
-rw-r--r--llvm/test/tools/llvm-objcopy/DXContainer/dump-section-errors.yaml27
-rw-r--r--llvm/test/tools/llvm-objcopy/DXContainer/dump-section.yaml278
-rw-r--r--llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s23
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test7
-rw-r--r--llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml86
-rw-r--r--llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml27
-rw-r--r--llvm/tools/bugpoint/BugDriver.cpp18
-rw-r--r--llvm/tools/bugpoint/BugDriver.h37
-rw-r--r--llvm/tools/bugpoint/CrashDebugger.cpp124
-rw-r--r--llvm/tools/bugpoint/ExecutionDriver.cpp57
-rw-r--r--llvm/tools/bugpoint/ExtractFunction.cpp64
-rw-r--r--llvm/tools/bugpoint/Miscompilation.cpp40
-rw-r--r--llvm/tools/bugpoint/OptimizerDriver.cpp2
-rw-r--r--llvm/tools/bugpoint/ToolRunner.cpp32
-rw-r--r--llvm/tools/bugpoint/bugpoint.cpp2
-rw-r--r--llvm/tools/llc/llc.cpp5
-rw-r--r--llvm/tools/lli/lli.cpp397
-rw-r--r--llvm/tools/llvm-c-test/debuginfo.c5
-rw-r--r--llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp4
-rw-r--r--llvm/tools/llvm-jitlink/llvm-jitlink.cpp63
-rw-r--r--llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp2
-rw-r--r--llvm/tools/llvm-mc/llvm-mc.cpp2
-rw-r--r--llvm/tools/llvm-ml/llvm-ml.cpp2
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp8
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp2
-rw-r--r--llvm/tools/obj2yaml/elf2yaml.cpp7
-rw-r--r--llvm/unittests/ADT/BitFieldsTest.cpp4
-rw-r--r--llvm/unittests/ADT/SmallVectorTest.cpp16
-rw-r--r--llvm/unittests/ADT/StringExtrasTest.cpp6
-rw-r--r--llvm/unittests/ADT/StringSwitchTest.cpp13
-rw-r--r--llvm/unittests/ADT/TypeTraitsTest.cpp4
-rw-r--r--llvm/unittests/Analysis/DXILResourceTest.cpp4
-rw-r--r--llvm/unittests/Analysis/ScalarEvolutionTest.cpp12
-rw-r--r--llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp51
-rw-r--r--llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp14
-rw-r--r--llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp186
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp27
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/ExecutorAddressTest.cpp6
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp4
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp10
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp8
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp7
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp20
-rw-r--r--llvm/unittests/Frontend/OpenMPDecompositionTest.cpp16
-rw-r--r--llvm/unittests/IR/ConstantFPRangeTest.cpp176
-rw-r--r--llvm/unittests/IR/ConstantsTest.cpp14
-rw-r--r--llvm/unittests/IR/InstructionsTest.cpp40
-rw-r--r--llvm/unittests/IR/RuntimeLibcallsTest.cpp12
-rw-r--r--llvm/unittests/Object/ELFObjectFileTest.cpp178
-rw-r--r--llvm/unittests/Object/ELFTypesTest.cpp35
-rw-r--r--llvm/unittests/Support/CMakeLists.txt1
-rw-r--r--llvm/unittests/Support/Format.cpp56
-rw-r--r--llvm/unittests/Support/SourceMgrTest.cpp11
-rw-r--r--llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp159
-rw-r--r--llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp220
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp9
-rwxr-xr-xllvm/utils/Misc/zkill14
-rw-r--r--llvm/utils/TableGen/Basic/DirectiveEmitter.cpp6
-rw-r--r--llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp28
-rw-r--r--llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/Common/Types.cpp8
-rw-r--r--llvm/utils/TableGen/FastISelEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp11
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.cpp2
-rwxr-xr-xllvm/utils/clang-parse-diagnostics-file10
-rwxr-xr-xllvm/utils/git/code-format-helper.py9
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn20
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/Basic/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lld/test/BUILD.gn7
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn1
-rw-r--r--llvm/utils/profcheck-xfail.txt90
-rw-r--r--[-rwxr-xr-x]llvm/utils/release/build_llvm_release.bat150
-rwxr-xr-xllvm/utils/unicode-case-fold.py6
-rw-r--r--llvm/utils/vim/syntax/llvm.vim2
948 files changed, 101599 insertions, 51999 deletions
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 3cbfb0d..e411ed4 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -11,6 +11,7 @@ add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(MustacheBench Mustache.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(SpecialCaseListBM SpecialCaseListBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(RuntimeLibcallsBench RuntimeLibcalls.cpp PARTIAL_SOURCES_INTENDED)
diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp
new file mode 100644
index 0000000..00aa3cd
--- /dev/null
+++ b/llvm/benchmarks/SpecialCaseListBM.cpp
@@ -0,0 +1,207 @@
+#include "benchmark/benchmark.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+constexpr int RNG_SEED = 123456;
+constexpr int MAX_LIST_MIN = 10;
+constexpr int MAX_LIST_MAX = 1000000;
+constexpr int MAX_LIST_MUL = 10;
+
+std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List) {
+ std::string Error;
+ std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(List);
+ auto SCL = SpecialCaseList::create(MB.get(), Error);
+ assert(SCL);
+ assert(Error == "");
+ return SCL;
+}
+
+static const std::string Dictionary[] = {
+ "orange", "tabby", "tortie", "tuxedo", "void",
+ "multiple", "spaces", "cute", "fluffy", "kittens",
+};
+
+std::vector<std::string> genFiles(size_t NumFiles) {
+ std::vector<std::string> R;
+ R.reserve(NumFiles);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> DepthDistrib(8, 16);
+ std::uniform_int_distribution<> WordDistrib(0, std::size(Dictionary) - 1);
+
+ std::string S;
+ for (size_t I = 0; I < NumFiles; ++I) {
+ for (size_t D = DepthDistrib(Rng); D; --D) {
+ S += Dictionary[WordDistrib(Rng)];
+ if (D > 1)
+ S += "/";
+ }
+ R.push_back(std::move(S));
+ S.clear();
+ }
+ return R;
+}
+
+std::string genGlobNone(const std::vector<std::string> &Files) {
+ std::string S;
+ for (const auto &F : Files) {
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobInMid(const std::vector<std::string> &Files) {
+ std::string S;
+ std::minstd_rand Rng(RNG_SEED);
+ for (std::string F : Files) {
+ std::uniform_int_distribution<> PosDistrib(0, F.size() - 1);
+ F[PosDistrib(Rng)] = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtStart(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.front() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtEnd(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.back() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtBothSides(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.back() = '*';
+ F.front() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+void BM_Make_(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ for (auto _ : state) {
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ benchmark::DoNotOptimize(SCL);
+ }
+}
+void BM_True_(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> LineDistrib(0, BigFileList.size() - 1);
+ for (auto _ : state) {
+ auto &Q = BigFileList[LineDistrib(Rng)];
+ bool R = SCL->inSection("", "src", Q);
+ if (!R)
+ abort();
+ benchmark::DoNotOptimize(R);
+ }
+}
+
+void BM_False(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> LineDistrib(0, BigFileList.size() - 1);
+ for (auto _ : state) {
+ std::string Q = BigFileList[LineDistrib(Rng)];
+ std::uniform_int_distribution<> PosDistrib(0, Q.size() - 1);
+ Q[PosDistrib(Rng)] = '_';
+ bool R = SCL->inSection("", "src", Q);
+ benchmark::DoNotOptimize(R);
+ }
+}
+
+} // namespace
+
+BENCHMARK_CAPTURE(BM_Make_, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_CAPTURE(BM_True_, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_CAPTURE(BM_False, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_MAIN();
diff --git a/llvm/docs/AArch64SME.rst b/llvm/docs/AArch64SME.rst
index 47ed7bc..327f9dc 100644
--- a/llvm/docs/AArch64SME.rst
+++ b/llvm/docs/AArch64SME.rst
@@ -124,7 +124,7 @@ In this table, we use the following abbreviations:
either 0 or 1 on entry, and is unchanged on return).
Functions with ``__attribute__((arm_locally_streaming))`` are excluded from this
-table because for the caller the attribute is synonymous to 'streaming', and
+table because for the caller the attribute is synonymous with 'streaming', and
for the callee it is merely an implementation detail that is explicitly not
exposed to the caller.
@@ -158,7 +158,7 @@ the function's body, so that it can place the mode changes in exactly the right
position. The suitable place to do this seems to be SelectionDAG, where it lowers
the call's arguments/return values to implement the specified calling convention.
SelectionDAG provides Chains and Glue to specify the order of operations and give
-preliminary control over the instruction's scheduling.
+preliminary control over instruction scheduling.
Example of preserving state
@@ -232,8 +232,8 @@ implement transitions from ``SC -> N`` and ``SC -> S``.
Unchained Function calls
------------------------
When a function with "``aarch64_pstate_sm_enabled``" calls a function that is not
-streaming compatible, the compiler has to insert a SMSTOP before the call and
-insert a SMSTOP after the call.
+streaming compatible, the compiler has to insert an SMSTOP before the call and
+insert an SMSTOP after the call.
If the function that is called is an intrinsic with no side-effects which in
turn is lowered to a function call (e.g., ``@llvm.cos()``), then the call to
@@ -388,7 +388,7 @@ The value of PSTATE.SM is not controlled by the feature flags, but rather by the
function attributes. This means that we can compile for '``+sme``', and the compiler
will code-generate any instructions, even if they are not legal under the requested
streaming mode. The compiler needs to use the function attributes to ensure the
-compiler doesn't do transformations under the assumption that certain operations
+compiler doesn't perform transformations under the assumption that certain operations
are available at runtime.
We made a conscious choice not to model this with feature flags because we
@@ -399,11 +399,11 @@ and `D121208 <https://reviews.llvm.org/D121208>`_) because of limitations in
TableGen.
As a first step, this means we'll disable vectorization (LoopVectorize/SLP)
-entirely when the a function has either of the ``aarch64_pstate_sm_enabled``,
+entirely when a function has either of the ``aarch64_pstate_sm_enabled``,
``aarch64_pstate_sm_body`` or ``aarch64_pstate_sm_compatible`` attributes,
in order to avoid the use of vector instructions.
-Later on we'll aim to relax these restrictions to enable scalable
+Later on, we'll aim to relax these restrictions to enable scalable
auto-vectorization with a subset of streaming-compatible instructions, but that
requires changes to the CostModel, Legalization and SelectionDAG lowering.
@@ -416,7 +416,7 @@ Other things to consider
------------------------
* Inlining must be disabled when the call-site needs to toggle PSTATE.SM or
- when the callee's function body is executed in a different streaming mode than
+ when the callee's function body is executed in a different streaming mode from
its caller. This is needed because function calls are the boundaries for
streaming mode changes.
@@ -434,8 +434,8 @@ lazy-save mechanism for calls to private-ZA functions (i.e. functions that may
either directly or indirectly clobber ZA state).
For the purpose of handling functions marked with ``aarch64_new_za``,
-we have introduced a new LLVM IR pass (SMEABIPass) that is run just before
-SelectionDAG. Any such functions dealt with by this pass are marked with
+we have introduced a new LLVM IR pass (SMEABIPass) that runs just before
+SelectionDAG. Any such functions handled by this pass are marked with
``aarch64_expanded_pstate_za``.
Setting up a lazy-save
@@ -458,7 +458,7 @@ AArch64 Predicate-as-Counter Type
The predicate-as-counter type represents the type of a predicate-as-counter
value held in an AArch64 SVE predicate register. Such a value contains
information about the number of active lanes, the element width and a bit that
-tells whether the generated mask should be inverted. ACLE intrinsics should be
+indicates whether the generated mask should be inverted. ACLE intrinsics should be
used to move the predicate-as-counter value to/from a predicate vector.
There are certain limitations on the type:
@@ -466,7 +466,7 @@ There are certain limitations on the type:
* The type can be used for function parameters and return values.
* The supported LLVM operations on this type are limited to ``load``, ``store``,
- ``phi``, ``select`` and ``alloca`` instructions.
+ ``phi``, ``select``, and ``alloca`` instructions.
The predicate-as-counter type is a scalable type.
diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
index ba670d3..f472b862 100644
--- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
+++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
@@ -37,13 +37,13 @@ includes contributions to open source projects such as LLVM [:ref:`LLVM
The LLVM compiler has upstream support for commercially available AMD GPU
hardware (AMDGPU) [:ref:`AMDGPU-LLVM <amdgpu-dwarf-AMDGPU-LLVM>`]. The open
-source ROCgdb [:ref:`AMD-ROCgdb <amdgpu-dwarf-AMD-ROCgdb>`] GDB based debugger
+source ROCgdb [:ref:`AMD-ROCgdb <amdgpu-dwarf-AMD-ROCgdb>`] GDB-based debugger
also has support for AMDGPU which is being upstreamed. Support for AMDGPU is
also being added by third parties to the GCC [:ref:`GCC <amdgpu-dwarf-GCC>`]
compiler and the Perforce TotalView HPC Debugger [:ref:`Perforce-TotalView
<amdgpu-dwarf-Perforce-TotalView>`].
-To support debugging heterogeneous programs several features that are not
+To support debugging heterogeneous programs, several features that are not
provided by current DWARF Version 5 [:ref:`DWARF <amdgpu-dwarf-DWARF>`] have
been identified. The :ref:`amdgpu-dwarf-extensions` section gives an overview of
the extensions devised to address the missing features. The extensions seek to
@@ -107,7 +107,7 @@ for each in terms of heterogeneous debugging.
DWARF Version 5 does not allow location descriptions to be entries on the DWARF
expression stack. They can only be the final result of the evaluation of a DWARF
expression. However, by allowing a location description to be a first-class
-entry on the DWARF expression stack it becomes possible to compose expressions
+entry on the DWARF expression stack, it becomes possible to compose expressions
containing both values and location descriptions naturally. It allows objects to
be located in any kind of memory address space, in registers, be implicit
values, be undefined, or a composite of any of these.
@@ -123,20 +123,20 @@ non-default address spaces and generalizing the power of composite location
descriptions to any kind of location description.
For those familiar with the definition of location descriptions in DWARF Version
-5, the definitions in these extensions are presented differently, but does in
+5, the definitions in these extensions are presented differently, but do in
fact define the same concept with the same fundamental semantics. However, it
does so in a way that allows the concept to extend to support address spaces,
bit addressing, the ability for composite location descriptions to be composed
of any kind of location description, and the ability to support objects located
at multiple places. Collectively these changes expand the set of architectures
-that can be supported and improves support for optimized code.
+that can be supported and improve support for optimized code.
Several approaches were considered, and the one presented, together with the
extensions it enables, appears to be the simplest and cleanest one that offers
the greatest improvement of DWARF's ability to support debugging optimized GPU
and non-GPU code. Examining the GDB debugger and LLVM compiler, it appears only
to require modest changes as they both already have to support general use of
-location descriptions. It is anticipated that will also be the case for other
+location descriptions. It is anticipated that this will also be the case for other
debuggers and compilers.
GDB has been modified to evaluate DWARF Version 5 expressions with location
@@ -156,7 +156,7 @@ DWARF Expression Stack* [:ref:`AMDGPU-DWARF-LOC
2.2 Generalize CFI to Allow Any Location Description Kind
---------------------------------------------------------
-CFI describes restoring callee saved registers that are spilled. Currently CFI
+CFI describes restoring callee saved registers that are spilled. Currently, CFI
only allows a location description that is a register, memory address, or
implicit location description. AMDGPU optimized code may spill scalar registers
into portions of vector registers. This requires extending CFI to allow any
@@ -223,7 +223,7 @@ infinite precision offsets to allow it to correctly track a series of positive
and negative offsets that may transiently overflow or underflow, but end up in
range. This is simple for the arithmetic operations as they are defined in terms
of two's complement arithmetic on a base type of a fixed size. Therefore, the
-offset operation define that integer overflow is ill-formed. This is in contrast
+offset operation defines that integer overflow is ill-formed. This is in contrast
to the ``DW_OP_plus``, ``DW_OP_plus_uconst``, and ``DW_OP_minus`` arithmetic
operations which define that it causes wrap-around.
@@ -359,7 +359,7 @@ address space at a fixed address.
The ``DW_OP_LLVM_form_aspace_address`` (see
:ref:`amdgpu-dwarf-memory-location-description-operations`) operation is defined
-to create a memory location description from an address and address space. If
+to create a memory location description from an address and address space. It
can be used to specify the location of a variable that is allocated in a
specific address space. This allows the size of addresses in an address space to
be larger than the generic type. It also allows a consumer great implementation
@@ -372,7 +372,7 @@ In contrast, if the ``DW_OP_LLVM_form_aspace_address`` operation had been
defined to produce a value, and an implicit conversion to a memory location
description was defined, then it would be limited to the size of the generic
type (which matches the size of the default address space). An implementation
-would likely have to use *reserved ranges* of value to represent different
+would likely have to use *reserved ranges* of values to represent different
address spaces. Such a value would likely not match any address value in the
actual hardware. That would require the consumer to have special treatment for
such values.
@@ -528,7 +528,7 @@ active. To describe the conceptual location of non-active lanes requires an
attribute that has an expression that computes the source location PC for each
lane.
-For efficiency, the expression calculates the source location the wavefront as a
+For efficiency, the expression calculates the source location of the wavefront as a
whole. This can be done using the ``DW_OP_LLVM_select_bit_piece`` (see
:ref:`amdgpu-dwarf-operation-to-create-vector-composite-location-descriptions`)
operation.
@@ -564,7 +564,7 @@ information entry to indicate that there is additional target architecture
specific information in the debugging information entries of that compilation
unit. This allows a consumer to know what extensions are present in the debugger
information entries as is possible with the augmentation string of other
-sections. See .
+sections.
The format that should be used for an augmentation string is also recommended.
This allows a consumer to parse the string when it contains information from
@@ -581,7 +581,7 @@ See :ref:`amdgpu-dwarf-full-and-partial-compilation-unit-entries`,
AMDGPU supports programming languages that include online compilation where the
source text may be created at runtime. For example, the OpenCL and HIP language
-runtimes support online compilation. To support is, a way to embed the source
+runtimes support online compilation. To support this, a way to embed the source
text in the debug information is provided.
See :ref:`amdgpu-dwarf-line-number-information`.
@@ -589,16 +589,16 @@ See :ref:`amdgpu-dwarf-line-number-information`.
2.17 Allow MD5 Checksums to be Optionally Present
-------------------------------------------------
-In DWARF Version 5 the file timestamp and file size can be optional, but if the
-MD5 checksum is present it must be valid for all files. This is a problem if
+In DWARF Version 5, the file timestamp and file size can be optional, but if the
+MD5 checksum is present, it must be valid for all files. This is a problem if
using link time optimization to combine compilation units where some have MD5
-checksums and some do not. Therefore, sSupport to allow MD5 checksums to be
-optionally present in the line table is added.
+checksums, and others do not. Therefore, the line table is extended to allow MD5
+checksums to be optional.
See :ref:`amdgpu-dwarf-line-number-information`.
-2.18 Add the HIP Programing Language
-------------------------------------
+2.18 Add the HIP Programming Language
+-------------------------------------
The HIP programming language [:ref:`HIP <amdgpu-dwarf-HIP>`], which is supported
by the AMDGPU, is added.
@@ -617,7 +617,7 @@ hardware to allow a single instruction to execute multiple iterations using
vector registers.
Note that although this is similar to SIMT execution, the way a client debugger
-uses the information is fundamentally different. In SIMT execution the debugger
+uses the information is fundamentally different. In SIMT execution, the debugger
needs to present the concurrent execution as distinct source language threads
that the user can list and switch focus between. With iteration concurrency
optimizations, such as software pipelining and vectorized SIMD, the debugger
@@ -648,7 +648,7 @@ language loop iterations are executing concurrently. See
It is common in SIMD vectorization for the compiler to generate code that
promotes portions of an array into vector registers. For example, if the
hardware has vector registers with 8 elements, and 8 wide SIMD instructions, the
-compiler may vectorize a loop so that is executes 8 iterations concurrently for
+compiler may vectorize a loop so that it executes 8 iterations concurrently for
each vectorized loop iteration.
On the first iteration of the generated vectorized loop, iterations 0 to 7 of
@@ -691,7 +691,7 @@ Inside the loop body, the machine code loads ``src[i]`` and ``dst[i]`` into
registers, adds them, and stores the result back into ``dst[i]``.
Considering the location of ``dst`` and ``src`` in the loop body, the elements
-``dst[i]`` and ``src[i]`` would be located in registers, all other elements are
+``dst[i]`` and ``src[i]`` would be located in registers; all other elements are
located in memory. Let register ``R0`` contain the base address of ``dst``,
register ``R1`` contain ``i``, and register ``R2`` contain the registerized
``dst[i]`` element. We can describe the location of ``dst`` as a memory location
@@ -722,7 +722,7 @@ with a register location overlaid at a runtime offset involving ``i``:
----------------------------------------------
AMDGPU supports languages, such as OpenCL, that define source language memory
-spaces. Support is added to define language specific memory spaces so they can
+spaces. Support is added to define language-specific memory spaces so they can
be used in a consistent way by consumers. See :ref:`amdgpu-dwarf-memory-spaces`.
A new attribute ``DW_AT_LLVM_memory_space`` is added to support using memory
@@ -738,9 +738,9 @@ accommodates only 32 unique operations. In practice, the lack of a central
registry and a desire for backwards compatibility means vendor extensions are
never retired, even when standard versions are accepted into DWARF proper. This
has produced a situation where the effective encoding space available for new
-vendor extensions is miniscule today.
+vendor extensions is minuscule today.
-To expand this encoding space a new DWARF operation ``DW_OP_LLVM_user`` is
+To expand this encoding space, a new DWARF operation ``DW_OP_LLVM_user`` is
added which acts as a "prefix" for vendor extensions. It is followed by a
ULEB128 encoded vendor extension opcode, which is then followed by the operands
of the corresponding vendor extension operation.
@@ -776,7 +776,7 @@ A. Changes Relative to DWARF Version 5
.. note::
Notes are included to describe how the changes are to be applied to the
- DWARF Version 5 standard. They also describe rational and issues that may
+ DWARF Version 5 standard. They also describe rationale and issues that may
need further consideration.
A.2 General Description
@@ -898,7 +898,7 @@ elements that can be specified are:
*A current lane*
- The 0 based SIMT lane identifier to be used in evaluating a user presented
+ The 0-based SIMT lane identifier to be used in evaluating a user presented
expression. This applies to source languages that are implemented for a target
architecture using a SIMT execution model. These implementations map source
language threads of execution to lanes of the target architecture threads.
@@ -917,7 +917,7 @@ elements that can be specified are:
*A current iteration*
- The 0 based source language iteration instance to be used in evaluating a user
+ The 0-based source language iteration instance to be used in evaluating a user
presented expression. This applies to target architectures that support
optimizations that result in executing multiple source language loop iterations
concurrently.
@@ -1845,7 +1845,7 @@ There are these special value operations currently defined:
interpreted as a value of T. If a conversion is wanted it can be done
explicitly using a ``DW_OP_convert`` operation.
- GDB has a per register hook that allows a target specific conversion on a
+ GDB has a per register hook that allows a target-specific conversion on a
register by register basis. It defaults to truncation of bigger registers.
Removing use of the target hook does not cause any test failures in common
architectures. If the compiler for a target architecture did want some
@@ -1855,7 +1855,7 @@ There are these special value operations currently defined:
If T is a larger type than the register size, then the default GDB
register hook reads bytes from the next register (or reads out of bounds
for the last register!). Removing use of the target hook does not cause
- any test failures in common architectures (except an illegal hand written
+ any test failures in common architectures (except an illegal hand-written
assembly test). If a target architecture requires this behavior, these
extensions allow a composite location description to be used to combine
multiple registers.
@@ -2283,7 +2283,7 @@ bit offset equal to V scaled by 8 (the byte size).
The implicit conversion could also be defined as target architecture specific.
For example, GDB checks if V is an integral type. If it is not it gives an
error. Otherwise, GDB zero-extends V to 64 bits. If the GDB target defines a
- hook function, then it is called. The target specific hook function can modify
+ hook function, then it is called. The target-specific hook function can modify
the 64-bit value, possibly sign extending based on the original value type.
Finally, GDB treats the 64-bit value V as a memory location address.
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index a4d110f..e062032 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -488,21 +488,21 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
**GCN GFX11 (RDNA 3.5)** [AMD-GCN-GFX11-RDNA3.5]_
-----------------------------------------------------------------------------------------------------------------------
- ``gfx1150`` ``amdgcn`` APU - cumode - Architected *TBA*
+ ``gfx1150`` ``amdgcn`` APU - cumode - Architected Radeon 890M
- wavefrontsize64 flat
scratch .. TODO::
- Packed
work-item Add product
IDs names.
- ``gfx1151`` ``amdgcn`` APU - cumode - Architected *TBA*
+ ``gfx1151`` ``amdgcn`` APU - cumode - Architected Radeon 8060S
- wavefrontsize64 flat
scratch .. TODO::
- Packed
work-item Add product
IDs names.
- ``gfx1152`` ``amdgcn`` APU - cumode - Architected *TBA*
+ ``gfx1152`` ``amdgcn`` APU - cumode - Architected Radeon 860M
- wavefrontsize64 flat
scratch .. TODO::
- Packed
@@ -883,6 +883,8 @@ supported for the ``amdgcn`` target.
Buffer Fat Pointer 7 N/A N/A 160 0
Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000
Buffer Strided Pointer (experimental) 9 *TODO*
+ *reserved for downstream use* 10
+ *reserved for downstream use* 11
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================
@@ -4172,7 +4174,7 @@ non-AMD key names should be prefixed by "*vendor-name*.".
"Image", or "Pipe". This may be
more restrictive than indicated
by "AccQual" to reflect what the
- kernel actual does. If not
+ kernel actually does. If not
present then the runtime must
assume what is implied by
"AccQual" and "IsConst". Values
@@ -5436,8 +5438,8 @@ The fields used by CP for code objects before V3 also match those specified in
``COMPUTE_PGM_RSRC1.PRIORITY``.
13:12 2 bits FLOAT_ROUND_MODE_32 Wavefront starts execution
with specified rounding
- mode for single (32
- bit) floating point
+ mode for single (32-bit)
+ floating point
precision floating point
operations.
@@ -5769,7 +5771,7 @@ The fields used by CP for code objects before V3 also match those specified in
Wavefront starts execution
with memory violation
- exceptions exceptions
+ exceptions
enabled which are generated
when a memory violation has
occurred for this wavefront from
@@ -6005,7 +6007,7 @@ The fields used by CP for code objects before V3 also match those specified in
FLOAT_DENORM_MODE_FLUSH_NONE 3 No Flush
====================================== ===== ====================================
- Denormal flushing is sign respecting. i.e. the behavior expected by
+ Denormal flushing is sign respecting, i.e., the behavior expected by
``"denormal-fp-math"="preserve-sign"``. The behavior is undefined with
``"denormal-fp-math"="positive-zero"``
@@ -16831,7 +16833,7 @@ For GFX125x:
* Some memory operations contain a ``nv`` bit, for "non-volatile", which indicates
memory that is not expected to change during a kernel's execution.
This information is propagated to the cache lines for that address
- (refered to as ``$nv``).
+ (referred to as ``$nv``).
* When ``nv=0`` reads hit dirty ``$nv=1`` data in cache, the hardware will
writeback the data to the next level in the hierarchy and then subsequently read
@@ -18970,7 +18972,7 @@ On entry to a function:
#. All other registers are unspecified.
#. Any necessary ``s_waitcnt`` has been performed to ensure memory is available
to the function.
-#. Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct
+#. Use pass-by-reference (byref) instead of pass-by-value (byval) for struct
arguments in C ABI. Callee is responsible for allocating stack memory and
copying the value of the struct if modified. Note that the backend still
supports byval for struct arguments.
@@ -20214,7 +20216,7 @@ from the value of the ``-mcpu`` option that is passed to the assembler.
.amdgpu_hsa_kernel (name)
+++++++++++++++++++++++++
-This directives specifies that the symbol with given name is a kernel entry
+This directive specifies that the symbol with given name is a kernel entry
point (label) and the object should contain corresponding symbol of type
STT_AMDGPU_HSA_KERNEL.
diff --git a/llvm/docs/CMakeLists.txt b/llvm/docs/CMakeLists.txt
index b4522e3..fc37c6d 100644
--- a/llvm/docs/CMakeLists.txt
+++ b/llvm/docs/CMakeLists.txt
@@ -136,17 +136,23 @@ if( NOT uses_ocaml LESS 0 AND LLVM_ENABLE_OCAMLDOC )
list(APPEND odoc_files -load ${odoc_file})
endforeach()
- add_custom_target(ocaml_doc
- COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
- COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
- COMMAND ${OCAMLFIND} ocamldoc -d ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
- -sort -colorize-code -html ${odoc_files}
- COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/_ocamldoc/style.css
- ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html)
+ set(OCAML_DOC_ADD_TO_ALL "")
+ if(LLVM_BUILD_DOCS)
+ set(OCAML_DOC_ADD_TO_ALL ALL)
+ endif()
+
+ add_custom_target(ocaml_doc ${OCAML_DOC_ADD_TO_ALL}
+ COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+ COMMAND ${OCAMLFIND} ocamldoc -d ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html
+ -sort -colorize-code -html ${odoc_files}
+ COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/_ocamldoc/style.css
+ ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html)
add_dependencies(ocaml_doc ${doc_targets})
- if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+
+ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY AND LLVM_BUILD_DOCS)
# ./ suffix is needed to copy the contents of html directory without
# appending html/ into LLVM_INSTALL_OCAMLDOC_HTML_DIR.
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ocamldoc/html/.
diff --git a/llvm/docs/CallGraphSection.md b/llvm/docs/CallGraphSection.md
index 8b18727..84d6061 100644
--- a/llvm/docs/CallGraphSection.md
+++ b/llvm/docs/CallGraphSection.md
@@ -1,10 +1,10 @@
-# .callgraph Section Layout
+# .llvm.callgraph Section Layout
-The `.callgraph` section is used to store call graph information for each function. The section contains a series of records, with each record corresponding to a single function.
+The `.llvm.callgraph` section is used to store call graph information for each function. The section contains a series of records, with each record corresponding to a single function.
## Per Function Record Layout
-Each record in the `.callgraph` section has the following binary layout:
+Each record in the `.llvm.callgraph` section has the following binary layout:
| Field | Type | Size (bits) | Description |
| -------------------------------------- | ------------- | ----------- | ------------------------------------------------------------------------------------------------------- |
diff --git a/llvm/docs/CodeOfConduct.rst b/llvm/docs/CodeOfConduct.rst
index 645ae12..995d32b 100644
--- a/llvm/docs/CodeOfConduct.rst
+++ b/llvm/docs/CodeOfConduct.rst
@@ -171,6 +171,7 @@ The current committee members are:
Transparency Reports
====================
+* `July 15, 2025 <https://discourse.llvm.org/t/llvm-code-of-conduct-transparency-report-july-15-2024-july-15-2025/88622>`_
* `July 15, 2024 <https://discourse.llvm.org/t/llvm-code-of-conduct-transparency-report-july-15-2023-july-15-2024/82687>`_
* `July 15, 2023 <https://llvm.org/coc-reports/2023-07-15-report.html>`_
* `July 15, 2022 <https://llvm.org/coc-reports/2022-07-15-report.html>`_
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 91dcd5c8..f253e02f 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -746,3 +746,92 @@ Examples:
@llvm.dx.resource.load.cbufferrow.8(
target("dx.CBuffer", target("dx.Layout", {i16}, 2, 0)) %buffer,
i32 %index)
+
+Resource dimensions
+-------------------
+
+*relevant types: Textures and Buffer*
+
+The `getDimensions`_ DXIL operation returns the dimensions of a texture or
+buffer resource. It returns a `Dimensions`_ type, which is a struct
+containing four ``i32`` values. The values in the struct represent the size
+of each dimension of the resource, and when aplicable the number of array
+elements or number of samples. The mapping is defined in the
+`getDimensions`_ documentation.
+
+The LLVM IR representation of this operation has several forms
+depending on the resource type and the specific ``getDimensions`` query.
+The intrinsics return a scalar or anonymous struct with up to 4 `i32`
+elements. The intrinsic names include suffixes to indicate the number of
+elements in the return value. The suffix `.x` indicates a single `i32`
+return value, `.xy` indicates a struct with two `i32` values, and `.xyz`
+indicates a struct with three `i32` values.
+
+Intrinsics representing queries on multisampled texture resources include
+`.ms.` in their name and their return value includes an additional `i32` for
+the number of samples.
+
+Intrinsics with a `mip_level` argument and `.levels.` in their name are used
+for texture resources with multiple MIP levels. Their return
+struct includes an additional `i32` for the number of levels the resource has.
+
+.. code-block:: llvm
+
+ i32 @llvm.dx.resource.getdimensions.x( target("dx.*") handle )
+ {i32, i32} @llvm.dx.resource.getdimensions.xy( target("dx.*") handle )
+ {i32, i32, i32} @llvm.dx.resource.getdimensions.xyz( target("dx.*") handle )
+ {i32, i32} @llvm.dx.resource.getdimensions.levels.x( target("dx.*") handle, i32 mip_level )
+ {i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xy( target("dx.*") handle, i32 mip_level )
+ {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xyz( target("dx.*") handle, i32 mip_level )
+ {i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xy( target("dx.*") handle )
+ {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xyz( target("dx.*") handle )
+
+.. list-table:: ``@llvm.dx.resource.getdimensions.*``
+ :header-rows: 1
+
+ * - Argument
+ -
+ - Type
+ - Description
+ * - Return value
+ -
+ - `i32`, `{i32, i32}`, `{i32, i32, i32}`, or `{i32, i32, i32, i32}`
 - Width, height, and depth of the resource (based on the specific suffix), and a number of levels or samples where applicable.
+ * - ``%handle``
+ - 0
+ - ``target(dx.*)``
+ - Resource handle
+ * - ``%mip_level``
+ - 1
+ - ``i32``
+ - MIP level for the requested dimensions.
+
+Examples:
+
+.. code-block:: llvm
+
+ ; RWBuffer<float4>
+ %dim = call i32 @llvm.dx.resource.getdimensions.x(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %handle)
+
+ ; Texture2D
+ %0 = call {i32, i32} @llvm.dx.resource.getdimensions.xy(target("dx.Texture", ...) %tex2d)
+ %tex2d_width = extractvalue {i32, i32} %0, 0
+ %tex2d_height = extractvalue {i32, i32} %0, 1
+
+ ; Texture2DArray with levels
+ %1 = call {i32, i32, i32, i32} @llvm.dx.resource.getdimensions.levels.xyz(
+ target("dx.Texture", ...) %tex2darray, i32 1)
+ %tex2darray_width = extractvalue {i32, i32, i32, i32} %1, 0
+ %tex2darray_height = extractvalue {i32, i32, i32, i32} %1, 1
+ %tex2darray_elem_count = extractvalue {i32, i32, i32, i32} %1, 2
+ %tex2darray_levels_count = extractvalue {i32, i32, i32, i32} %1, 3
+
+ ; Texture2DMS
+ %2 = call {i32, i32, i32} @llvm.dx.resource.getdimensions.ms.xy(
+ target("dx.Texture", ...) %tex2dms)
+ %tex2dms_width = extractvalue {i32, i32, i32} %2, 0
+ %tex2dms_height = extractvalue {i32, i32, i32} %2, 1
+ %tex2dms_samples_count = extractvalue {i32, i32, i32} %2, 2
+
+.. _Dimensions: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
+.. _getDimensions: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#getdimensions
diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst
index bc5746d..e65fd8f 100644
--- a/llvm/docs/GettingStartedVS.rst
+++ b/llvm/docs/GettingStartedVS.rst
@@ -126,6 +126,15 @@ These instructions were tested with Visual Studio 2019 and Python 3.9.6:
cmake -S llvm\llvm -B build -DLLVM_ENABLE_PROJECTS=clang -DLLVM_TARGETS_TO_BUILD=X86 -Thost=x64
exit
+ .. note::
+ By default, the Visual Studio project files generated by CMake use the
+ 32-bit toolset. If you are developing on a 64-bit version of Windows and
+ want to use the 64-bit toolset, pass the ``-Thost=x64`` flag when
+ generating the Visual Studio solution. This requires CMake 3.8.0 or later.
+
+ For Windows on Arm the equivalent is ``-Thost=ARM64``, but this is the default
+ for those hosts, so you do not have to use this option.
+
``LLVM_ENABLE_PROJECTS`` specifies any additional LLVM projects you want to
build while ``LLVM_TARGETS_TO_BUILD`` selects the compiler targets. If
``LLVM_TARGETS_TO_BUILD`` is omitted by default all targets are built
@@ -149,10 +158,6 @@ These instructions were tested with Visual Studio 2019 and Python 3.9.6:
* CMake generates project files for all build types. To select a specific
build type, use the Configuration manager from the VS IDE or the
``/property:Configuration`` command-line option when using MSBuild.
- * By default, the Visual Studio project files generated by CMake use the
- 32-bit toolset. If you are developing on a 64-bit version of Windows and
- want to use the 64-bit toolset, pass the ``-Thost=x64`` flag when
- generating the Visual Studio solution. This requires CMake 3.8.0 or later.
13. Start Visual Studio and select configuration:
diff --git a/llvm/docs/HowToBuildOnARM.rst b/llvm/docs/HowToBuildOnARM.rst
index 9eb6b5a..30e3744 100644
--- a/llvm/docs/HowToBuildOnARM.rst
+++ b/llvm/docs/HowToBuildOnARM.rst
@@ -23,10 +23,10 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
choices when using CMake. Autoconf usage is deprecated as of 3.8.
Building LLVM/Clang in ``Release`` mode is preferred since it consumes
- a lot less memory. Otherwise, the building process will very likely
+ a lot less memory. Otherwise, the build process will very likely
fail due to insufficient memory. It's also a lot quicker to only build
the relevant back-ends (ARM and AArch64), since it's very unlikely that
- you'll use an ARM board to cross-compile to other arches. If you're
+ you'll use an ARM board to cross-compile to other architectures. If you're
running Compiler-RT tests, also include the x86 back-end, or some tests
will fail.
@@ -48,15 +48,15 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
``make -jN check-all`` or ``ninja check-all`` will run all compiler tests. For
running the test suite, please refer to :doc:`TestingGuide`.
-#. If you are building LLVM/Clang on an ARM board with 1G of memory or less,
- please use ``gold`` rather then GNU ``ld``. In any case it is probably a good
+#. If you are building LLVM/Clang on an ARM board with 1 GB of memory or less,
+ please use ``gold`` rather than GNU ``ld``. In any case, it is probably a good
idea to set up a swap partition, too.
.. code-block:: bash
$ sudo ln -sf /usr/bin/ld /usr/bin/ld.gold
-#. ARM development boards can be unstable and you may experience that cores
+#. ARM development boards can be unstable, and you may experience that cores
are disappearing, caches being flushed on every big.LITTLE switch, and
other similar issues. To help ease the effect of this, set the Linux
scheduler to "performance" on **all** cores using this little script:
@@ -73,12 +73,12 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
problems.
#. Running the build on SD cards is ok, but they are more prone to failures
- than good quality USB sticks, and those are more prone to failures than
- external hard-drives (those are also a lot faster). So, at least, you
+ than good-quality USB sticks, and those are more prone to failures than
+ external hard drives (those are also a lot faster). So, at least, you
should consider to buy a fast USB stick. On systems with a fast eMMC,
that's a good option too.
#. Make sure you have a decent power supply (dozens of dollars worth) that can
- provide *at least* 4 amperes, this is especially important if you use USB
- devices with your board. Externally powered USB/SATA harddrives are even
+ provide *at least* 4 amperes. This is especially important if you use USB
+ devices with your board. Externally powered USB/SATA hard drives are even
better than having a good power supply.
diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst
index 1795d3a..171bf88 100644
--- a/llvm/docs/HowToReleaseLLVM.rst
+++ b/llvm/docs/HowToReleaseLLVM.rst
@@ -18,11 +18,11 @@ create the binary packages, please refer to the :doc:`ReleaseProcess` instead.
Release Timeline
================
-LLVM is released on a time based schedule --- with major releases roughly
+LLVM is released on a time-based schedule --- with major releases roughly
every 6 months. In between major releases there may be dot releases.
The release manager will determine if and when to make a dot release based
on feedback from the community. Typically, dot releases should be made if
-there are large number of bug-fixes in the stable branch or a critical bug
+there are a large number of bug fixes in the stable branch or a critical bug
has been discovered that affects a large number of users.
Unless otherwise stated, dot releases will follow the same procedure as
@@ -73,7 +73,7 @@ Release Process Summary
* Generate and send out the second release candidate sources. Only *critical*
bugs found during this testing phase will be fixed. Any bugs introduced by
- merged patches will be fixed. If so a third round of testing is needed.
+ merged patches will be fixed. If so, a third round of testing is needed.
* The release notes are updated.
@@ -107,15 +107,15 @@ Create Release Branch and Update LLVM Version
Branch the Git trunk using the following procedure:
#. Remind developers that the release branching is imminent and to refrain from
- committing patches that might break the build. E.g., new features, large
+ committing patches that might break the build, e.g., new features, large
patches for works in progress, an overhaul of the type system, an exciting
new TableGen feature, etc.
#. Verify that the current git trunk is in decent shape by
examining nightly tester and buildbot results.
-#. Bump the version in trunk to N.0.0git with the script in
- ``llvm/utils/release/bump-version.py``, and tag the commit with llvmorg-N-init.
+#. Bump the version in trunk to ``N.0.0git`` with the script in
+ ``llvm/utils/release/bump-version.py``, and tag the commit with ``llvmorg-N-init``.
If ``X`` is the version to be released, then ``N`` is ``X + 1``. ::
$ git tag -sa llvmorg-N-init
@@ -124,14 +124,14 @@ Branch the Git trunk using the following procedure:
``llvm/utils/release/clear-release-notes.py``.
#. Create the release branch from the last known good revision from before the
- version bump. The branch's name is release/X.x where ``X`` is the major version
+ version bump. The branch's name is ``release/X.x`` where ``X`` is the major version
number and ``x`` is just the letter ``x``.
#. On the newly-created release branch, immediately bump the version
- to X.1.0git (where ``X`` is the major version of the branch.)
+ to ``X.1.0git`` (where ``X`` is the major version of the branch.)
-#. All tags and branches need to be created in both the llvm/llvm-project and
- llvm/llvm-test-suite repos.
+#. All tags and branches need to be created in both the ``llvm/llvm-project`` and
+ ``llvm/llvm-test-suite`` repos.
Tagging the LLVM Release Candidates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -157,7 +157,7 @@ the release page.
$ for f in *.xz; do gh attestation verify --owner llvm $f && gpg -b $f; done
Tarballs, release binaries, or any other release artifacts must be uploaded to
-GitHub. This can be done using the github-upload-release.py script in utils/release.
+GitHub. This can be done using the ``github-upload-release.py`` script in ``utils/release``.
::
@@ -170,10 +170,10 @@ Build The Binary Distribution
Creating the binary distribution requires following the instructions
:doc:`here <ReleaseProcess>`.
-That process will perform both Release+Asserts and Release builds but only
-pack the Release build for upload. You should use the Release+Asserts sysroot,
+That process performs both Release+Asserts and Release builds but only packs
+the Release build for upload. You should use the Release+Asserts sysroot,
normally under ``final/Phase3/Release+Asserts/llvmCore-3.8.1-RCn.install/``,
-for test-suite and run-time benchmarks, to make sure nothing serious has
+for test-suite and run-time benchmarks, to ensure nothing serious has
passed through the net. For compile-time benchmarks, use the Release version.
The minimum required version of the tools you'll need are :doc:`here <GettingStarted>`
@@ -181,14 +181,14 @@ The minimum required version of the tools you'll need are :doc:`here <GettingSta
Release Qualification Criteria
------------------------------
-There are no official release qualification criteria. It is up to the
-the release manager to determine when a release is ready. The release manager
+There are no official release qualification criteria.
+The release manager determines when a release is ready. The release manager
should pay attention to the results of community testing, the number of outstanding
-bugs, and then number of regressions when determining whether or not to make a
+bugs, and the number of regressions when determining whether or not to make a
release.
The community values time based releases, so releases should not be delayed for
-too long unless there are critical issues remaining. In most cases, the only
+too long unless critical issues remain. In most cases, the only
kind of bugs that are critical enough to block a release would be a major regression
from a previous release.
@@ -199,33 +199,33 @@ A few developers in the community have dedicated time to validate the release
candidates and volunteered to be the official release testers for each
architecture.
-These will be the ones testing, generating and uploading the official binaries
+These will be the ones testing, generating, and uploading the official binaries
to the server, and will be the minimum tests *necessary* for the release to
proceed.
This will obviously not cover all OSs and distributions, so additional community
-validation is important. However, if community input is not reached before the
-release is out, all bugs reported will have to go on the next stable release.
+validation is important. However, if community input is not received before the
+release, all reported bugs will be deferred to the next stable release.
The official release managers are:
* Even releases: Tom Stellard (tstellar@redhat.com)
* Odd releases: Tobias Hieta (tobias@hieta.se)
-The official release testers are volunteered from the community and have
+The official release testers are volunteers from the community who have
consistently validated and released binaries for their targets/OSs. To contact
them, you should post on the `Discourse forums (Project
Infrastructure - Release Testers). <https://discourse.llvm.org/c/infrastructure/release-testers/66>`_
-The official testers list is in the file `RELEASE_TESTERS.TXT
+The official testers list is in the file ``RELEASE_TESTERS.TXT``
<https://github.com/llvm/llvm-project/blob/main/llvm/RELEASE_TESTERS.TXT>`_, in
the LLVM repository.
Community Testing
-----------------
-Once all testing has been completed and appropriate bugs filed, the release
-candidate tarballs are put on the website and the LLVM community is notified.
+Once all testing is complete and appropriate bugs are filed, the release
+candidate tarballs are put on the website, and the LLVM community is notified.
We ask that all LLVM developers test the release in any of the following ways:
@@ -251,7 +251,7 @@ We ask that all LLVM developers test the release in any the following ways:
architecture.
We also ask that the OS distribution release managers test their packages with
-the first candidate of every release, and report any *new* errors in GitHub.
+the first candidate of every release and report any *new* errors in GitHub.
If the bug can be reproduced with an unpatched upstream version of the release
candidate (as opposed to the distribution's own build), the priority should be
release blocker.
@@ -268,10 +268,10 @@ next stage.
Reporting Regressions
---------------------
-Every regression that is found during the tests (as per the criteria above),
+Every regression found during the tests (as per the criteria above)
should be filed as a bug in GitHub and added to the release milestone.
-If a bug can't be reproduced, or stops being a blocker, it should be removed
+If a bug can't be reproduced or stops being a blocker, it should be removed
from the Milestone. Debugging can continue, but on trunk.
Backport Requests
@@ -299,15 +299,15 @@ This section describes how to triage bug reports:
to see the list of bugs that are being considered for the release.
#. Review each bug and first check if it has been fixed in main. If it has, update
- its status to "Needs Pull Request", and create a pull request for the fix
- using the /cherry-pick or /branch comments if this has not been done already.
+ its status to "Needs Pull Request" and create a pull request for the fix
+ using the ``/cherry-pick`` or ``/branch`` comments if this has not been done already.
#. If a bug has been fixed and has a pull request created for backporting it,
then update its status to "Needs Review" and notify a knowledgeable
reviewer. Usually you will want to notify the person who approved the
patch, but you may use your best judgement on who a good reviewer would be.
Once you have identified the reviewer(s), assign the issue to them and
- mention them (i.e @username) in a comment and ask them if the patch is safe
+ mention them (i.e., ``@username``) in a comment and ask them if the patch is safe
to backport. You should also review the bug yourself to ensure that it
meets the requirements for committing to the release branch.
@@ -323,11 +323,11 @@ Release Patch Rules
Below are the rules regarding patching the release branch:
#. Patches applied to the release branch may only be applied by the release
- manager, the official release testers or the maintainers with approval from
+ manager, the official release testers, or the maintainers with approval from
the release manager.
#. Release managers are encouraged, but not required, to get approval from a
- maintainer before approving patches. If there are no reachable maintainers
+ maintainer before approving patches. If there are no reachable maintainers,
then release managers can ask approval from patch reviewers or other
developers active in that area.
@@ -336,7 +336,7 @@ Below are the rules regarding patching the release branch:
was created. As with all phases, release managers and maintainers can reject
patches that are deemed too invasive.
-#. *Before RC2/RC3* Patches should be limited to bug fixes or backend specific
+#. *Before RC2/RC3* Patches should be limited to bug fixes or backend-specific
improvements that are determined to be very safe.
#. *Before Final Major Release* Patches should be limited to critical
@@ -349,7 +349,7 @@ Below are the rules regarding patching the release branch:
Release Final Tasks
-------------------
-The final stages of the release process involves tagging the "final" release
+The final stages of the release process involve tagging the "final" release
branch, updating documentation that refers to the release, and updating the
demo page.
@@ -394,11 +394,11 @@ is what to do:
#. Update the ``releases/index.html`` with the new release and link to release
documentation.
-#. After you push the changes to the www-releases repo, someone with admin
- access must login to prereleases-origin.llvm.org and manually pull the new
- changes into /data/www-releases/. This is where the website is served from.
+#. After you push the changes to the ``www-releases`` repo, someone with admin
+ access must log in to ``prereleases-origin.llvm.org`` and manually pull the new
+ changes into ``/data/www-releases/``. This is where the website is served from.
-#. Finally checkout the llvm-www repo and update the main page
+#. Finally, check out the ``llvm-www`` repo and update the main page
(``index.html`` and sidebar) to point to the new release and release
announcement.
@@ -414,5 +414,5 @@ using this command and add it to the post.
$ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.1.N-1..llvmorg-X.1.N
-Once the release has been announced add a link to the announcement on the llvm
-homepage (from the llvm-www repo) in the "Release Emails" section.
+Once the release has been announced, add a link to the announcement on the llvm
+homepage (from the ``llvm-www`` repo) in the "Release Emails" section.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8b6c25c..5b4b53d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -7517,12 +7517,12 @@ sections that the user does not want removed after linking.
'``unpredictable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-``unpredictable`` metadata may be attached to any branch or switch
-instruction. It can be used to express the unpredictability of control
-flow. Similar to the ``llvm.expect`` intrinsic, it may be used to alter
-optimizations related to compare and branch instructions. The metadata
-is treated as a boolean value; if it exists, it signals that the branch
-or switch that it is attached to is completely unpredictable.
+``unpredictable`` metadata may be attached to any branch, select, or switch
+instruction. It can be used to express the unpredictability of control flow.
+Similar to the ``llvm.expect`` intrinsic, it may be used to alter optimizations
+related to compare and branch instructions. The metadata is treated as a
+boolean value; if it exists, it signals that the branch, select, or switch that
+it is attached to is completely unpredictable.
.. _md_dereferenceable:
@@ -21062,33 +21062,36 @@ integer element type.
Syntax:
"""""""
-This is an overloaded intrinsic.
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.load``
+to load any vector type with a stride of any bitwidth up to 64.
::
- declare vectorty @llvm.matrix.column.major.load.*(
+ declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(
ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+ declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(
+ ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
Overview:
"""""""""
The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>``
matrix using a stride of ``%Stride`` to compute the start address of the
-different columns. The offset is computed using ``%Stride``'s bitwidth. This
-allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the
-intrinsic is considered a :ref:`volatile memory access <volatile>`. The result
-matrix is returned in the result vector. If the ``%Ptr`` argument is known to
-be aligned to some boundary, this can be specified as an attribute on the
-argument.
+different columns. This allows for convenient loading of sub-matrices.
+Independent of ``%Stride``'s bitwidth, the offset is computed using the target
+data layout's pointer index type. If ``<IsVolatile>`` is true, the intrinsic is
+considered a :ref:`volatile memory access <volatile>`. The result matrix is
+returned in the result vector. If the ``%Ptr`` argument is known to be aligned
+to some boundary, this can be specified as an attribute on the argument.
Arguments:
""""""""""
The first argument ``%Ptr`` is a pointer type to the returned vector type, and
corresponds to the start address to load from. The second argument ``%Stride``
-is a positive, constant integer with ``%Stride >= <Rows>``. ``%Stride`` is used
-to compute the column memory addresses. I.e., for a column ``C``, its start
-memory addresses is calculated with ``%Ptr + C * %Stride``. The third Argument
+is a positive integer for which ``%Stride >= <Rows>``. ``%Stride`` is used to
+compute the column memory addresses. I.e., for a column ``C``, its start memory
+address is calculated with ``%Ptr + C * %Stride``. The third argument
``<IsVolatile>`` is a boolean value. The fourth and fifth arguments,
``<Rows>`` and ``<Cols>``, correspond to the number of rows and columns,
respectively, and must be positive, constant integers. The returned vector must
@@ -21103,20 +21106,26 @@ The :ref:`align <attr_align>` parameter attribute can be provided for the
Syntax:
"""""""
+This is an overloaded intrinsic. You can use ``llvm.matrix.column.major.store``
+to store any vector type with a stride of any bitwidth up to 64.
::
- declare void @llvm.matrix.column.major.store.*(
- vectorty %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>, i32 <Cols>)
+ declare void @llvm.matrix.column.major.store.v4i32.i64(
+ <4 x i32> %In, ptrty %Ptr, i64 %Stride, i1 <IsVolatile>, i32 <Rows>,
+ i32 <Cols>)
+ declare void @llvm.matrix.column.major.store.v9f64.i32(
+ <9 x double> %In, ptrty %Ptr, i32 %Stride, i1 <IsVolatile>, i32
+ <Rows>, i32 <Cols>)
Overview:
"""""""""
The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x
<Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between
-columns. The offset is computed using ``%Stride``'s bitwidth. If
-``<IsVolatile>`` is true, the intrinsic is considered a
-:ref:`volatile memory access <volatile>`.
+columns. Independent of ``%Stride``'s bitwidth, the offset is computed using
+the target data layout's pointer index type. If ``<IsVolatile>`` is true, the
+intrinsic is considered a :ref:`volatile memory access <volatile>`.
If the ``%Ptr`` argument is known to be aligned to some boundary, this can be
specified as an attribute on the argument.
@@ -21127,7 +21136,7 @@ Arguments:
The first argument ``%In`` is a vector that corresponds to a ``<Rows> x
<Cols>`` matrix to be stored to memory. The second argument ``%Ptr`` is a
pointer to the vector type of ``%In``, and is the start address of the matrix
-in memory. The third argument ``%Stride`` is a positive, constant integer with
+in memory. The third argument ``%Stride`` is a positive integer for which
``%Stride >= <Rows>``. ``%Stride`` is used to compute the column memory
addresses. I.e., for a column ``C``, its start memory addresses is calculated
with ``%Ptr + C * %Stride``. The fourth argument ``<IsVolatile>`` is a boolean
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 79d93d0..9cdd983 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -134,9 +134,14 @@ Changes to the WebAssembly Backend
Changes to the Windows Target
-----------------------------
+* `-fpseudo-probe-for-profiling` is now supported for COFF.
+
Changes to the X86 Backend
--------------------------
+* `-mcpu=wildcatlake` is now supported.
+* `-mcpu=novalake` is now supported.
+
Changes to the OCaml bindings
-----------------------------
@@ -147,6 +152,7 @@ Changes to the C API
--------------------
* Add `LLVMGetOrInsertFunction` to get or insert a function, replacing the combination of `LLVMGetNamedFunction` and `LLVMAddFunction`.
+* Allow `LLVMGetVolatile` to work with any kind of Instruction.
Changes to the CodeGen infrastructure
-------------------------------------
@@ -160,6 +166,8 @@ Changes to the Debug Info
Changes to the LLVM tools
---------------------------------
+* `llvm-profgen` now supports decoding pseudo probe for COFF binaries.
+
* `llvm-readelf` now dumps all hex format values in lower-case mode.
* Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
* Support for `%T` in lit has been removed.
@@ -169,6 +177,12 @@ Changes to LLDB
* LLDB can now set breakpoints, show backtraces, and display variables when
debugging Wasm with supported runtimes (WAMR and V8).
+* LLDB no longer stops processes by default when receiving SIGWINCH signals
+ (window resize events) on Linux. This is the default on other Unix platforms.
+ You can re-enable it using `process handle --notify=true --stop=true SIGWINCH`.
+* The `show-progress` setting, which became a NOOP with the introduction of the
+ statusline, now defaults to off and controls using OSC escape codes to show a
+ native progress bar in supporting terminals like Ghostty and ConEmu.
Changes to BOLT
---------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index d2d6646..85eeabf 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -235,6 +235,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
- Adds execution modes and decorations to control floating-point computations in both kernels and shaders. It can be used on whole modules and individual instructions.
* - ``SPV_INTEL_predicated_io``
- Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate.
+ * - ``SPV_KHR_maximal_reconvergence``
+ - Adds execution mode and capability to enable maximal reconvergence.
SPIR-V representation in LLVM IR
================================
diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst
index 14232bc..7f57137 100644
--- a/llvm/docs/TableGen/BackEnds.rst
+++ b/llvm/docs/TableGen/BackEnds.rst
@@ -48,7 +48,7 @@ the TableGen files, the back-ends and their users.
For instance, a global contract is that each back-end produces macro-guarded
sections. Based on whether the file is included by a header or a source file,
or even in which context of each file the include is being used, you have
-todefine a macro just before including it, to get the right output:
+to define a macro just before including it, to get the right output:
.. code-block:: c++
@@ -80,8 +80,8 @@ in the TableGen files.
CodeEmitter
-----------
-**Purpose**: CodeEmitterGen uses the descriptions of instructions and their fields to
-construct an automated code emitter: a function that, given a MachineInstr,
+**Purpose**: ``CodeEmitterGen`` uses the descriptions of instructions and their fields to
+construct an automated code emitter: a function that, given a ``MachineInstr``,
returns the (currently, 32-bit unsigned) value of the instruction.
**Output**: C++ code, implementing the target's CodeEmitter
@@ -130,7 +130,7 @@ AsmMatcher
----------
**Purpose**: Emits a target specifier matcher for
-converting parsed assembly operands in the MCInst structures. It also
+converting parsed assembly operands in the ``MCInst`` structures. It also
emits a matcher for custom operand parsing. Extensive documentation is
written on the ``AsmMatcherEmitter.cpp`` file.
@@ -167,7 +167,7 @@ CallingConv
conventions supported by this target.
**Output**: Implement static functions to deal with calling conventions
-chained by matching styles, returning false on no match.
+chained by matching styles, returning ``false`` on no match.
**Usage**: Used in ISelLowering and FastIsel as function pointers to
implementation returned by a CC selection function.
@@ -200,7 +200,7 @@ FastISel
**Purpose**: This tablegen backend emits code for use by the "fast"
instruction selection algorithm. See the comments at the top of
-lib/CodeGen/SelectionDAG/FastISel.cpp for background. This file
+``lib/CodeGen/SelectionDAG/FastISel.cpp`` for background. This file
scans through the target's tablegen instruction-info files
and extracts instructions with obvious-looking patterns, and it emits
code to look up these instructions by type and operator.
@@ -270,23 +270,23 @@ This file is included as part of ``Attr.h``.
ClangAttrParserStringSwitches
-----------------------------
-**Purpose**: Creates AttrParserStringSwitches.inc, which contains
-StringSwitch::Case statements for parser-related string switches. Each switch
+**Purpose**: Creates ``AttrParserStringSwitches.inc``, which contains
+``StringSwitch::Case`` statements for parser-related string switches. Each switch
is given its own macro (such as ``CLANG_ATTR_ARG_CONTEXT_LIST``, or
``CLANG_ATTR_IDENTIFIER_ARG_LIST``), which is expected to be defined before
-including AttrParserStringSwitches.inc, and undefined after.
+including ``AttrParserStringSwitches.inc``, and undefined after.
ClangAttrImpl
-------------
-**Purpose**: Creates AttrImpl.inc, which contains semantic attribute class
+**Purpose**: Creates ``AttrImpl.inc``, which contains semantic attribute class
definitions for any attribute in ``Attr.td`` that has not set ``ASTNode = 0``.
This file is included as part of ``AttrImpl.cpp``.
ClangAttrList
-------------
-**Purpose**: Creates AttrList.inc, which is used when a list of semantic
+**Purpose**: Creates ``AttrList.inc``, which is used when a list of semantic
attribute identifiers is required. For instance, ``AttrKinds.h`` includes this
file to generate the list of ``attr::Kind`` enumeration values. This list is
separated out into multiple categories: attributes, inheritable attributes, and
@@ -297,25 +297,25 @@ functionality required for ``dyn_cast`` and similar APIs.
ClangAttrPCHRead
----------------
-**Purpose**: Creates AttrPCHRead.inc, which is used to deserialize attributes
+**Purpose**: Creates ``AttrPCHRead.inc``, which is used to deserialize attributes
in the ``ASTReader::ReadAttributes`` function.
ClangAttrPCHWrite
-----------------
-**Purpose**: Creates AttrPCHWrite.inc, which is used to serialize attributes in
+**Purpose**: Creates ``AttrPCHWrite.inc``, which is used to serialize attributes in
the ``ASTWriter::WriteAttributes`` function.
ClangAttrSpellings
---------------------
-**Purpose**: Creates AttrSpellings.inc, which is used to implement the
+**Purpose**: Creates ``AttrSpellings.inc``, which is used to implement the
``__has_attribute`` feature test macro.
ClangAttrSpellingListIndex
--------------------------
-**Purpose**: Creates AttrSpellingListIndex.inc, which is used to map parsed
+**Purpose**: Creates ``AttrSpellingListIndex.inc``, which is used to map parsed
attribute spellings (including which syntax or scope was used) to an attribute
spelling list index. These spelling list index values are internal
implementation details exposed via
@@ -324,26 +324,26 @@ implementation details exposed via
ClangAttrVisitor
-------------------
-**Purpose**: Creates AttrVisitor.inc, which is used when implementing
+**Purpose**: Creates ``AttrVisitor.inc``, which is used when implementing
recursive AST visitors.
ClangAttrTemplateInstantiate
----------------------------
-**Purpose**: Creates AttrTemplateInstantiate.inc, which implements the
+**Purpose**: Creates ``AttrTemplateInstantiate.inc``, which implements the
``instantiateTemplateAttribute`` function, used when instantiating a template
that requires an attribute to be cloned.
ClangAttrParsedAttrList
-----------------------
-**Purpose**: Creates AttrParsedAttrList.inc, which is used to generate the
+**Purpose**: Creates ``AttrParsedAttrList.inc``, which is used to generate the
``AttributeList::Kind`` parsed attribute enumeration.
ClangAttrParsedAttrImpl
-----------------------
-**Purpose**: Creates AttrParsedAttrImpl.inc, which is used by
+**Purpose**: Creates ``AttrParsedAttrImpl.inc``, which is used by
``AttributeList.cpp`` to implement several functions on the ``AttributeList``
class. This functionality is implemented via the ``AttrInfoMap ParsedAttrInfo``
array, which contains one element per parsed attribute object.
@@ -351,14 +351,14 @@ array, which contains one element per parsed attribute object.
ClangAttrParsedAttrKinds
------------------------
-**Purpose**: Creates AttrParsedAttrKinds.inc, which is used to implement the
+**Purpose**: Creates ``AttrParsedAttrKinds.inc``, which is used to implement the
``AttributeList::getKind`` function, mapping a string (and syntax) to a parsed
attribute ``AttributeList::Kind`` enumeration.
ClangAttrDump
-------------
-**Purpose**: Creates AttrDump.inc, which dumps information about an attribute.
+**Purpose**: Creates ``AttrDump.inc``, which dumps information about an attribute.
It is used to implement ``ASTDumper::dumpAttr``.
ClangDiagsDefs
@@ -424,7 +424,7 @@ Generate list of commands that are used in documentation comments.
ArmNeon
-------
-Generate arm_neon.h for clang.
+Generate ``arm_neon.h`` for clang.
ArmNeonSema
-----------
@@ -473,7 +473,7 @@ to a built-in backend.
**Output**:
-The root of the output file is a JSON object (i.e. dictionary),
+The root of the output file is a JSON object (i.e., dictionary),
containing the following fixed keys:
* ``!tablegen_json_version``: a numeric version field that will
@@ -520,7 +520,7 @@ conventions described below.
Some TableGen data types are translated directly into the
corresponding JSON type:
-* A completely undefined value (e.g. for a variable declared without
+* A completely undefined value (e.g., for a variable declared without
initializer in some superclass of this record, and never initialized
by the record itself or any other superclass) is emitted as the JSON
``null`` value.
@@ -964,7 +964,7 @@ Here is the modified lookup function.
The new lookup function will return an iterator range with first pointer to the
first result and the last pointer to the last matching result from the table.
-However, please note that the support for emitting modified definition exists
+However, please note that the support for emitting a modified definition exists
for ``PrimaryKeyName`` only.
The ``PrimaryKeyEarlyOut`` field, when set to 1, modifies the lookup
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 3d22f859..4e380d9 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -4757,7 +4757,7 @@ LLVM_C_ABI LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str,
LLVM_C_ABI LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B,
const char *Str,
const char *Name);
-LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef MemoryAccessInst);
+LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef Inst);
LLVM_C_ABI void LLVMSetVolatile(LLVMValueRef MemoryAccessInst,
LLVMBool IsVolatile);
LLVM_C_ABI LLVMBool LLVMGetWeak(LLVMValueRef CmpXchgInst);
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index 2ecd69a..70da3a6 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -204,6 +204,11 @@ enum {
typedef unsigned LLVMMetadataKind;
/**
+ * The kind of checksum to emit.
+ */
+typedef enum { CSK_MD5, CSK_SHA1, CSK_SHA256 } LLVMChecksumKind;
+
+/**
* An LLVM DWARF type encoding.
*/
typedef unsigned LLVMDWARFTypeEncoding;
@@ -327,6 +332,25 @@ LLVM_C_ABI LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder,
size_t DirectoryLen);
/**
+ * Create a file descriptor to hold debugging information for a file.
+ * \param Builder The \c DIBuilder.
+ * \param Filename File name.
+ * \param FilenameLen The length of the C string passed to \c Filename.
+ * \param Directory Directory.
+ * \param DirectoryLen The length of the C string passed to \c Directory.
+ * \param ChecksumKind The kind of checksum, e.g., MD5 or SHA256.
+ * \param Checksum The checksum.
+ * \param ChecksumLen The length of the checksum.
+ * \param Source The embedded source.
+ * \param SourceLen The length of the source.
+ */
+LLVM_C_ABI LLVMMetadataRef LLVMDIBuilderCreateFileWithChecksum(
+ LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen,
+ const char *Directory, size_t DirectoryLen, LLVMChecksumKind ChecksumKind,
+ const char *Checksum, size_t ChecksumLen, const char *Source,
+ size_t SourceLen);
+
+/**
* Creates a new descriptor for a module with the specified parent scope.
* \param Builder The \c DIBuilder.
* \param ParentScope The parent scope containing this module declaration.
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 6bc6284..a1bfce7 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -1313,7 +1313,7 @@ public:
/// Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
/// This preserves the sign and payload bits.
- APFloat makeQuiet() const {
+ [[nodiscard]] APFloat makeQuiet() const {
APFloat Result(*this);
Result.getIEEE().makeQuiet();
return Result;
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 4064d71..1af2761 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -86,89 +86,43 @@
#include <limits> // numeric_limits
#include <type_traits>
+#include "llvm/Support/MathExtras.h"
+
namespace llvm {
namespace bitfields_details {
-/// A struct defining useful bit patterns for n-bits integer types.
-template <typename T, unsigned Bits> struct BitPatterns {
- /// Bit patterns are forged using the equivalent `Unsigned` type because of
- /// undefined operations over signed types (e.g. Bitwise shift operators).
- /// Moreover same size casting from unsigned to signed is well defined but not
- /// the other way around.
- using Unsigned = std::make_unsigned_t<T>;
- static_assert(sizeof(Unsigned) == sizeof(T), "Types must have same size");
-
- static constexpr unsigned TypeBits = sizeof(Unsigned) * CHAR_BIT;
- static_assert(TypeBits >= Bits, "n-bit must fit in T");
-
- /// e.g. with TypeBits == 8 and Bits == 6.
- static constexpr Unsigned AllZeros = Unsigned(0); // 00000000
- static constexpr Unsigned AllOnes = ~Unsigned(0); // 11111111
- static constexpr Unsigned Umin = AllZeros; // 00000000
- static constexpr Unsigned Umax = AllOnes >> (TypeBits - Bits); // 00111111
- static constexpr Unsigned SignBitMask = Unsigned(1) << (Bits - 1); // 00100000
- static constexpr Unsigned Smax = Umax >> 1U; // 00011111
- static constexpr Unsigned Smin = ~Smax; // 11100000
- static constexpr Unsigned SignExtend = Unsigned(Smin << 1U); // 11000000
-};
-
-/// `Compressor` is used to manipulate the bits of a (possibly signed) integer
-/// type so it can be packed and unpacked into a `bits` sized integer,
-/// `Compressor` is specialized on signed-ness so no runtime cost is incurred.
-/// The `pack` method also checks that the passed in `UserValue` is valid.
-template <typename T, unsigned Bits, bool = std::is_unsigned<T>::value>
-struct Compressor {
- static_assert(std::is_unsigned<T>::value, "T must be unsigned");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= BP::Umax && "value is too big");
- return UserValue;
- }
-
- static T unpack(T StorageValue) { return StorageValue; }
-};
-
-template <typename T, unsigned Bits> struct Compressor<T, Bits, false> {
- static_assert(std::is_signed<T>::value, "T must be signed");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= T(BP::Smax) && "value is too big");
- assert(UserValue >= T(BP::Smin) && "value is too small");
- if (UserValue < 0)
- UserValue &= ~BP::SignExtend;
- return UserValue;
- }
-
- static T unpack(T StorageValue) {
- if (StorageValue >= T(BP::SignBitMask))
- StorageValue |= BP::SignExtend;
- return StorageValue;
- }
-};
-
/// Impl is where Bitfield description and Storage are put together to interact
/// with values.
template <typename Bitfield, typename StorageType> struct Impl {
static_assert(std::is_unsigned<StorageType>::value,
"Storage must be unsigned");
using IntegerType = typename Bitfield::IntegerType;
- using C = Compressor<IntegerType, Bitfield::Bits>;
- using BP = BitPatterns<StorageType, Bitfield::Bits>;
static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT;
static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask");
static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask");
- static constexpr StorageType Mask = BP::Umax << Bitfield::Shift;
+ static constexpr StorageType LowMask =
+ maskTrailingOnes<StorageType>(Bitfield::Bits);
+ static constexpr StorageType Mask = LowMask << Bitfield::Shift;
+
+ /// Validates that `UserValue` fits within the bitfield's range.
+ static void checkValue(IntegerType UserValue, IntegerType UserMaxValue) {
+ assert(UserValue <= UserMaxValue && "value is too big");
+ if constexpr (std::is_unsigned_v<IntegerType>) {
+ assert(isUInt<Bitfield::Bits>(UserValue) && "value is too big");
+ } else {
+ static_assert(std::is_signed_v<IntegerType>,
+ "IntegerType must be signed");
+ assert(isInt<Bitfield::Bits>(UserValue) && "value is out of range");
+ }
+ }
/// Checks `UserValue` is within bounds and packs it between `FirstBit` and
/// `LastBit` of `Packed` leaving the rest unchanged.
static void update(StorageType &Packed, IntegerType UserValue) {
- const StorageType StorageValue = C::pack(UserValue, Bitfield::UserMaxValue);
+ checkValue(UserValue, Bitfield::UserMaxValue);
+ const StorageType StorageValue = UserValue & LowMask;
Packed &= ~Mask;
Packed |= StorageValue << Bitfield::Shift;
}
@@ -177,7 +131,9 @@ template <typename Bitfield, typename StorageType> struct Impl {
/// an `IntegerType`.
static IntegerType extract(StorageType Packed) {
const StorageType StorageValue = (Packed & Mask) >> Bitfield::Shift;
- return C::unpack(StorageValue);
+ if constexpr (std::is_signed_v<IntegerType>)
+ return SignExtend64<Bitfield::Bits>(StorageValue);
+ return StorageValue;
}
/// Interprets bits between `FirstBit` and `LastBit` of `Packed` as
diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h
index 7d81c63..2440e76 100644
--- a/llvm/include/llvm/ADT/StringExtras.h
+++ b/llvm/include/llvm/ADT/StringExtras.h
@@ -529,13 +529,15 @@ inline std::string join_items(Sep Separator, Args &&... Items) {
class ListSeparator {
bool First = true;
StringRef Separator;
+ StringRef Prefix;
public:
- ListSeparator(StringRef Separator = ", ") : Separator(Separator) {}
+ ListSeparator(StringRef Separator = ", ", StringRef Prefix = "")
+ : Separator(Separator), Prefix(Prefix) {}
operator StringRef() {
if (First) {
First = false;
- return {};
+ return Prefix;
}
return Separator;
}
diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h
index 0ce7c57a..26d5682 100644
--- a/llvm/include/llvm/ADT/StringSwitch.h
+++ b/llvm/include/llvm/ADT/StringSwitch.h
@@ -14,9 +14,11 @@
#define LLVM_ADT_STRINGSWITCH_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstring>
+#include <initializer_list>
#include <optional>
namespace llvm {
@@ -37,7 +39,7 @@ namespace llvm {
/// .Case("green", Green)
/// .Case("blue", Blue)
/// .Case("indigo", Indigo)
-/// .Cases("violet", "purple", Violet)
+/// .Cases({"violet", "purple"}, Violet)
/// .Default(UnknownColor);
/// \endcode
template<typename T, typename R = T>
@@ -53,21 +55,18 @@ public:
explicit StringSwitch(StringRef S)
: Str(S), Result() { }
+ StringSwitch(StringSwitch &&) = default;
+
// StringSwitch is not copyable.
StringSwitch(const StringSwitch &) = delete;
// StringSwitch is not assignable due to 'Str' being 'const'.
void operator=(const StringSwitch &) = delete;
- void operator=(StringSwitch &&other) = delete;
-
- StringSwitch(StringSwitch &&other)
- : Str(other.Str), Result(std::move(other.Result)) { }
-
- ~StringSwitch() = default;
+ void operator=(StringSwitch &&) = delete;
// Case-sensitive case matchers
StringSwitch &Case(StringLiteral S, T Value) {
- CaseImpl(Value, S);
+ CaseImpl(S, Value);
return *this;
}
@@ -85,60 +84,70 @@ public:
return *this;
}
+ StringSwitch &Cases(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesImpl(CaseStrings, Value);
+ }
+
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesImpl(Value, S0, S1);
+ return CasesImpl({S0, S1}, Value);
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesImpl(Value, S0, S1, S2);
+ return CasesImpl({S0, S1, S2}, Value);
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3);
+ return CasesImpl({S0, S1, S2, S3}, Value);
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4);
+ return CasesImpl({S0, S1, S2, S3, S4}, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5);
+ return CasesImpl({S0, S1, S2, S3, S4, S5}, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6);
+ return CasesImpl({S0, S1, S2, S3, S4, S5, S6}, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7);
+ return CasesImpl({S0, S1, S2, S3, S4, S5, S6, S7}, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8);
+ return CasesImpl({S0, S1, S2, S3, S4, S5, S6, S7, S8}, Value);
}
+ [[deprecated("Pass cases in std::initializer_list instead")]]
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
StringLiteral S9, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9);
+ return CasesImpl({S0, S1, S2, S3, S4, S5, S6, S7, S8, S9}, Value);
}
// Case-insensitive case matchers.
StringSwitch &CaseLower(StringLiteral S, T Value) {
- CaseLowerImpl(Value, S);
+ CaseLowerImpl(S, Value);
return *this;
}
@@ -156,23 +165,28 @@ public:
return *this;
}
+ StringSwitch &CasesLower(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesLowerImpl(CaseStrings, Value);
+ }
+
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesLowerImpl(Value, S0, S1);
+ return CasesLowerImpl({S0, S1}, Value);
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesLowerImpl(Value, S0, S1, S2);
+ return CasesLowerImpl({S0, S1, S2}, Value);
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3);
+ return CasesLowerImpl({S0, S1, S2, S3}, Value);
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3, S4);
+ return CasesLowerImpl({S0, S1, S2, S3, S4}, Value);
}
[[nodiscard]] R Default(T Value) {
@@ -193,7 +207,7 @@ public:
private:
// Returns true when `Str` matches the `S` argument, and stores the result.
- bool CaseImpl(T &Value, StringLiteral S) {
+ bool CaseImpl(StringLiteral S, T &Value) {
if (!Result && Str == S) {
Result = std::move(Value);
return true;
@@ -203,7 +217,7 @@ private:
// Returns true when `Str` matches the `S` argument (case-insensitive), and
// stores the result.
- bool CaseLowerImpl(T &Value, StringLiteral S) {
+ bool CaseLowerImpl(StringLiteral S, T &Value) {
if (!Result && Str.equals_insensitive(S)) {
Result = std::move(Value);
return true;
@@ -211,16 +225,21 @@ private:
return false;
}
- template <typename... Args> StringSwitch &CasesImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases,
+ T &Value) {
// Stop matching after the string is found.
- (... || CaseImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseImpl(S, Value))
+ break;
return *this;
}
- template <typename... Args>
- StringSwitch &CasesLowerImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesLowerImpl(std::initializer_list<StringLiteral> Cases,
+ T &Value) {
// Stop matching after the string is found.
- (... || CaseLowerImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseLowerImpl(S, Value))
+ break;
return *this;
}
};
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index 88ac0a1..c7aff16 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -243,6 +243,25 @@ public:
}
};
+/// The dx.Padding target extension type
+///
+/// `target("dx.Padding", NumBytes)`
+class PaddingExtType : public TargetExtType {
+public:
+ PaddingExtType() = delete;
+ PaddingExtType(const PaddingExtType &) = delete;
+ PaddingExtType &operator=(const PaddingExtType &) = delete;
+
+ unsigned getNumBytes() const { return getIntParameter(0); }
+
+ static bool classof(const TargetExtType *T) {
+ return T->getName() == "dx.Padding";
+ }
+ static bool classof(const Type *T) {
+ return isa<TargetExtType>(T) && classof(cast<TargetExtType>(T));
+ }
+};
+
//===----------------------------------------------------------------------===//
class ResourceTypeInfo {
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 6bc51fe..5ad6288 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -575,7 +575,7 @@ public:
/// cached embeddings should be invalidated to ensure
/// correctness/recomputation. This is a no-op for SymbolicEmbedder but
/// removes all the cached entries in FlowAwareEmbedder.
- virtual void invalidateEmbeddings() { return; }
+ virtual void invalidateEmbeddings() {}
};
/// Class for computing the Symbolic embeddings of IR2Vec.
diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index a7a6a27..0ecb114 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -617,7 +617,7 @@ public:
};
/// Function to print a loop's contents as LLVM's text IR assembly.
-LLVM_ABI void printLoop(Loop &L, raw_ostream &OS,
+LLVM_ABI void printLoop(const Loop &L, raw_ostream &OS,
const std::string &Banner = "");
/// Find and return the loop attribute node for the attribute @p Name in
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 8876e4e..e5a6c8c 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -2316,10 +2316,6 @@ private:
/// an add rec on said loop.
void getUsedLoops(const SCEV *S, SmallPtrSetImpl<const Loop *> &LoopsUsed);
- /// Try to match the pattern generated by getURemExpr(A, B). If successful,
- /// Assign A and B to LHS and RHS, respectively.
- LLVM_ABI bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
-
/// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
/// `UniqueSCEVs`. Return if found, else nullptr.
SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 164b46b..9354eef 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -95,6 +95,10 @@ inline bind_ty<const SCEVAddExpr> m_scev_Add(const SCEVAddExpr *&V) {
return V;
}
+inline bind_ty<const SCEVMulExpr> m_scev_Mul(const SCEVMulExpr *&V) {
+ return V;
+}
+
/// Match a specified const SCEV *.
struct specificscev_ty {
const SCEV *Expr;
@@ -182,6 +186,12 @@ m_scev_PtrToInt(const Op0_t &Op0) {
return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
}
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
+m_scev_Trunc(const Op0_t &Op0) {
+ return m_scev_Unary<SCEVTruncateExpr>(Op0);
+}
+
/// Match a binary SCEV.
template <typename SCEVTy, typename Op0_t, typename Op1_t,
SCEV::NoWrapFlags WrapFlags = SCEV::FlagAnyWrap,
@@ -246,6 +256,88 @@ m_scev_UDiv(const Op0_t &Op0, const Op1_t &Op1) {
return m_scev_Binary<SCEVUDivExpr>(Op0, Op1);
}
+template <typename Op0_t, typename Op1_t>
+inline SCEVBinaryExpr_match<SCEVSMaxExpr, Op0_t, Op1_t>
+m_scev_SMax(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_scev_Binary<SCEVSMaxExpr>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SCEVBinaryExpr_match<SCEVMinMaxExpr, Op0_t, Op1_t>
+m_scev_MinMax(const Op0_t &Op0, const Op1_t &Op1) {
+ return m_scev_Binary<SCEVMinMaxExpr>(Op0, Op1);
+}
+
+/// Match unsigned remainder pattern.
+/// Matches patterns generated by getURemExpr.
+template <typename Op0_t, typename Op1_t> struct SCEVURem_match {
+ Op0_t Op0;
+ Op1_t Op1;
+ ScalarEvolution &SE;
+
+ SCEVURem_match(Op0_t Op0, Op1_t Op1, ScalarEvolution &SE)
+ : Op0(Op0), Op1(Op1), SE(SE) {}
+
+ bool match(const SCEV *Expr) const {
+ if (Expr->getType()->isPointerTy())
+ return false;
+
+ // Try to match 'zext (trunc A to iB) to iY', which is used
+ // for URem with constant power-of-2 second operands. Make sure the size of
+ // the operand A matches the size of the whole expression.
+ const SCEV *LHS;
+ if (SCEVPatternMatch::match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) {
+ Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType();
+ // Bail out if the type of the LHS is larger than the type of the
+ // expression for now.
+ if (SE.getTypeSizeInBits(LHS->getType()) >
+ SE.getTypeSizeInBits(Expr->getType()))
+ return false;
+ if (LHS->getType() != Expr->getType())
+ LHS = SE.getZeroExtendExpr(LHS, Expr->getType());
+ const SCEV *RHS =
+ SE.getConstant(APInt(SE.getTypeSizeInBits(Expr->getType()), 1)
+ << SE.getTypeSizeInBits(TruncTy));
+ return Op0.match(LHS) && Op1.match(RHS);
+ }
+
+ const SCEV *A;
+ const SCEVMulExpr *Mul;
+ if (!SCEVPatternMatch::match(Expr, m_scev_Add(m_scev_Mul(Mul), m_SCEV(A))))
+ return false;
+
+ const auto MatchURemWithDivisor = [&](const SCEV *B) {
+ // (SomeExpr + (-(SomeExpr / B) * B)).
+ if (Expr == SE.getURemExpr(A, B))
+ return Op0.match(A) && Op1.match(B);
+ return false;
+ };
+
+ // (SomeExpr + (-1 * (SomeExpr / B) * B)).
+ if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(2));
+
+ // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
+ if (Mul->getNumOperands() == 2)
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(0)) ||
+ MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(1))) ||
+ MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(0)));
+ return false;
+ }
+};
+
+/// Match the mathematical pattern A - (A / B) * B, where A and B can be
+/// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
+/// for URem with constant power-of-2 second operands. It's not always easy, as
+/// A and B can be folded (imagine A is X / 2, and B is 4, A / B becomes X / 8).
+template <typename Op0_t, typename Op1_t>
+inline SCEVURem_match<Op0_t, Op1_t> m_scev_URem(Op0_t LHS, Op1_t RHS,
+ ScalarEvolution &SE) {
+ return SCEVURem_match<Op0_t, Op1_t>(LHS, RHS, SE);
+}
+
inline class_match<const Loop> m_Loop() { return class_match<const Loop>(); }
/// Match an affine SCEVAddRecExpr.
diff --git a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
index fa21eba..ac03137 100644
--- a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
+++ b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
@@ -10,12 +10,33 @@
namespace llvm {
+namespace memprof {
+// Represents the eligibility status of a global variable for section prefix
+// annotation. Other than AnnotationOk, each enum value indicates a specific
+// reason for ineligibility.
+enum class AnnotationKind : uint8_t {
+ AnnotationOK,
+ DeclForLinker,
+ ExplicitSection,
+ ReservedName,
+};
+/// Returns the annotation kind of the global variable \p GV.
+AnnotationKind getAnnotationKind(const GlobalVariable &GV);
+
+/// Returns true if the annotation kind of the global variable \p GV is
+/// AnnotationOK.
+bool IsAnnotationOK(const GlobalVariable &GV);
+} // namespace memprof
+
/// A class that holds the constants that represent static data and their
/// profile information and provides methods to operate on them.
class StaticDataProfileInfo {
public:
- /// Accummulate the profile count of a constant that will be lowered to static
- /// data sections.
+ /// A constant is tracked only if the following conditions are met.
+ /// 1) It has local (i.e., private or internal) linkage.
+ /// 2) Its data kind is one of {.rodata, .data, .bss, .data.rel.ro}.
+ /// 3) It's eligible for section prefix annotation. See `AnnotationKind`
+ /// above for ineligible reasons.
DenseMap<const Constant *, uint64_t> ConstantProfileCounts;
/// Keeps track of the constants that are seen at least once without profile
@@ -26,8 +47,31 @@ public:
LLVM_ABI std::optional<uint64_t>
getConstantProfileCount(const Constant *C) const;
+ /// Use a signed enum for value comparison, and make 'LukewarmOrUnknown'
+ /// zero so any accidentally uninitialized value will default to unknown.
+ enum class StaticDataHotness : int8_t {
+ Cold = -1,
+ LukewarmOrUnknown = 0,
+ Hot = 1,
+ };
+
+ /// Return the hotness of the constant \p C based on its profile count \p
+ /// Count.
+ LLVM_ABI StaticDataHotness getConstantHotnessUsingProfileCount(
+ const Constant *C, const ProfileSummaryInfo *PSI, uint64_t Count) const;
+
+ /// Return the hotness based on section prefix \p SectionPrefix.
+ LLVM_ABI StaticDataHotness getSectionHotnessUsingDataAccessProfile(
+ std::optional<StringRef> SectionPrefix) const;
+
+ /// Return the string representation of the hotness enum \p Hotness.
+ LLVM_ABI StringRef hotnessToStr(StaticDataHotness Hotness) const;
+
+ bool EnableDataAccessProf = false;
+
public:
- StaticDataProfileInfo() = default;
+ StaticDataProfileInfo(bool EnableDataAccessProf)
+ : EnableDataAccessProf(EnableDataAccessProf) {}
/// If \p Count is not nullopt, add it to the profile count of the constant \p
/// C in a saturating way, and clamp the count to \p getInstrMaxCountValue if
@@ -36,14 +80,10 @@ public:
LLVM_ABI void addConstantProfileCount(const Constant *C,
std::optional<uint64_t> Count);
- /// Return a section prefix for the constant \p C based on its profile count.
- /// - If a constant doesn't have a counter, return an empty string.
- /// - Otherwise,
- /// - If it has a hot count, return "hot".
- /// - If it is seen by unprofiled function, return an empty string.
- /// - If it has a cold count, return "unlikely".
- /// - Otherwise (e.g. it's used by lukewarm functions), return an empty
- /// string.
+ /// Given a constant \p C, returns a section prefix.
+ /// If \p C is a global variable, the section prefix is the hotter of its
+ /// existing section prefix and the one derived from its use profile count.
+ /// Otherwise, the section prefix is based on its use profile count.
LLVM_ABI StringRef getConstantSectionPrefix(
const Constant *C, const ProfileSummaryInfo *PSI) const;
};
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 26963ed..3f39b47 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -373,12 +373,10 @@ public:
/// Disables all builtins.
///
/// This can be used for options like -fno-builtin.
- void disableAllFunctions() LLVM_ATTRIBUTE_UNUSED {
- OverrideAsUnavailable.set();
- }
+ [[maybe_unused]] void disableAllFunctions() { OverrideAsUnavailable.set(); }
/// Forces a function to be marked as unavailable.
- void setUnavailable(LibFunc F) LLVM_ATTRIBUTE_UNUSED {
+ [[maybe_unused]] void setUnavailable(LibFunc F) {
assert(F < OverrideAsUnavailable.size() && "out-of-bounds LibFunc");
OverrideAsUnavailable.set(F);
}
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index e619b18..8d0dc64 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -647,6 +647,7 @@ enum {
EF_HEXAGON_ISA_V85 = 0x00000085, // Hexagon V85 ISA
EF_HEXAGON_ISA_V87 = 0x00000087, // Hexagon V87 ISA
EF_HEXAGON_ISA_V89 = 0x00000089, // Hexagon V89 ISA
+ EF_HEXAGON_ISA_V91 = 0x00000091, // Hexagon V91 ISA
EF_HEXAGON_ISA = 0x000003ff, // Hexagon V.. ISA
// Tiny core flag, bit[15]
@@ -680,6 +681,7 @@ enum {
EF_HEXAGON_MACH_V85 = EF_HEXAGON_ISA_V85, // Hexagon V85
EF_HEXAGON_MACH_V87 = EF_HEXAGON_ISA_V87, // Hexagon V87
EF_HEXAGON_MACH_V89 = EF_HEXAGON_ISA_V89, // Hexagon V89
+ EF_HEXAGON_MACH_V91 = EF_HEXAGON_ISA_V91, // Hexagon V91
EF_HEXAGON_MACH = 0x0000ffff, // Hexagon V..
};
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
index 8dcc292..1cfcdbf 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
@@ -62,6 +62,7 @@ ELF_RELOC(R_AARCH64_LD64_GOTPAGE_LO15, 0x139)
ELF_RELOC(R_AARCH64_PLT32, 0x13a)
ELF_RELOC(R_AARCH64_GOTPCREL32, 0x13b)
ELF_RELOC(R_AARCH64_PATCHINST, 0x13c)
+ELF_RELOC(R_AARCH64_FUNCINIT64, 0x13d)
// General dynamic TLS relocations
ELF_RELOC(R_AARCH64_TLSGD_ADR_PREL21, 0x200)
ELF_RELOC(R_AARCH64_TLSGD_ADR_PAGE21, 0x201)
diff --git a/llvm/include/llvm/CAS/CASID.h b/llvm/include/llvm/CAS/CASID.h
index 8820994..f508ed3 100644
--- a/llvm/include/llvm/CAS/CASID.h
+++ b/llvm/include/llvm/CAS/CASID.h
@@ -95,8 +95,7 @@ public:
}
friend hash_code hash_value(const CASID &ID) {
- ArrayRef<uint8_t> Hash = ID.getHash();
- return hash_combine_range(Hash.begin(), Hash.end());
+ return hash_combine_range(ID.getHash());
}
const CASContext &getContext() const {
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 19ca444..9ace255 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -16,6 +16,7 @@
#define LLVM_CODEGEN_ASMPRINTER_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -87,6 +88,10 @@ namespace remarks {
class RemarkStreamer;
}
+namespace vfs {
+class FileSystem;
+}
+
/// This class is intended to be used as a driving class for all asm writers.
class LLVM_ABI AsmPrinter : public MachineFunctionPass {
public:
@@ -105,6 +110,9 @@ public:
/// generating (such as the current section etc).
std::unique_ptr<MCStreamer> OutStreamer;
+ /// The VFS to resolve asm include directives.
+ IntrusiveRefCntPtr<vfs::FileSystem> VFS;
+
/// The current machine function.
MachineFunction *MF = nullptr;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 9855444..51318c9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -383,7 +383,8 @@ LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx,
/// Keep the same scalar or element type as \p TypeIdx, but take the number of
/// elements from \p Ty.
-LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx, LLT Ty);
+LLVM_ABI LegalizeMutation changeElementCountTo(unsigned TypeIdx,
+ ElementCount EC);
/// Change the scalar size or element size to have the same scalar size as type
/// index \p FromIndex. Unlike changeElementTo, this discards pointer types and
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index c76c83d..ff3dd0d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -514,6 +514,12 @@ enum NodeType {
/// separately rounded operations.
FMAD,
+ /// FMULADD - Performs a * b + c, with, or without, intermediate rounding.
+ /// It is expected that this will be illegal for most targets, as it usually
+ /// makes sense to split this or use an FMA. But some targets, such as
+ /// WebAssembly, can directly support these semantics.
+ FMULADD,
+
/// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y. NOTE: This
/// DAG node does not require that X and Y have the same type, just that
/// they are both floating point. X and the result must have the same type.
diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 1050b3d..c252f9d 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -229,8 +229,8 @@ public:
/// doing something wrong if you call pruneValue directly on a
/// LiveInterval. Indeed, you are supposed to call pruneValue on the main
/// LiveRange and all the LiveRanges of the subranges if any.
- LLVM_ATTRIBUTE_UNUSED void pruneValue(LiveInterval &, SlotIndex,
- SmallVectorImpl<SlotIndex> *) {
+ [[maybe_unused]] void pruneValue(LiveInterval &, SlotIndex,
+ SmallVectorImpl<SlotIndex> *) {
llvm_unreachable(
"Use pruneValue on the main LiveRange and on each subrange");
}
diff --git a/llvm/include/llvm/CodeGen/LiveRangeCalc.h b/llvm/include/llvm/CodeGen/LiveRangeCalc.h
index e9b62fb..67f5b69 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeCalc.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeCalc.h
@@ -259,7 +259,7 @@ public:
/// jointly dominated by the blocks corresponding to the slot indices
/// in @p Defs. This function is mainly for use in self-verification
/// checks.
- LLVM_ABI LLVM_ATTRIBUTE_UNUSED static bool
+ [[maybe_unused]] LLVM_ABI static bool
isJointlyDominated(const MachineBasicBlock *MBB, ArrayRef<SlotIndex> Defs,
const SlotIndexes &Indexes);
};
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 62c0806..df6ce0f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1850,9 +1850,11 @@ public:
/// Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags);
+ const SDNodeFlags Flags,
+ bool AllowCommute = false);
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops);
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute = false);
/// Check if a node exists without modifying its flags.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 1169116..69713d0 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1950,7 +1950,7 @@ LLVM_ABI bool isOnesOrOnesSplat(SDValue N, bool AllowUndefs = false);
/// Return true if the value is a constant 0 integer or a splatted vector of a
/// constant 0 integer (with no undefs).
-/// Does not permit build vector implicit truncation.
+/// Build vector implicit truncation is allowed.
LLVM_ABI bool isZeroOrZeroSplat(SDValue N, bool AllowUndefs = false);
/// Return true if \p V is either a integer or FP constant.
diff --git a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
index 77ce052..2c59a52 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/DwarfTransformer.h
@@ -43,8 +43,14 @@ public:
///
/// \param LDCS Flag to indicate whether we should load the call site
/// information from DWARF `DW_TAG_call_site` entries
- DwarfTransformer(DWARFContext &D, GsymCreator &G, bool LDCS = false)
- : DICtx(D), Gsym(G), LoadDwarfCallSites(LDCS) {}
+ ///
+ /// \param MachO Flag to indicate if the object file is mach-o (Apple's
+ /// executable format). Apple has some compile unit attributes that look like
+  /// split DWARF, but they aren't and they can cause warnings to be emitted
+ /// about missing DWO files.
+ DwarfTransformer(DWARFContext &D, GsymCreator &G, bool LDCS = false,
+ bool MachO = false)
+ : DICtx(D), Gsym(G), LoadDwarfCallSites(LDCS), IsMachO(MachO) {}
/// Extract the DWARF from the supplied object file and convert it into the
/// Gsym format in the GsymCreator object that is passed in. Returns an
@@ -97,6 +103,7 @@ private:
DWARFContext &DICtx;
GsymCreator &Gsym;
bool LoadDwarfCallSites;
+ bool IsMachO;
friend class DwarfTransformerTest;
};
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
index f9070af..eb71e9a 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.h
@@ -32,8 +32,9 @@ public:
struct SymbolAddrs {
ExecutorAddr Allocator;
ExecutorAddr Reserve;
- ExecutorAddr Finalize;
- ExecutorAddr Deallocate;
+ ExecutorAddr Initialize;
+ ExecutorAddr Deinitialize;
+ ExecutorAddr Release;
};
/// Create an EPCGenericJITLinkMemoryManager instance from a given set of
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
index faec25d..fa48480 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h
@@ -31,8 +31,8 @@ public:
struct SymbolAddrs {
ExecutorAddr Instance;
ExecutorAddr Reserve;
- ExecutorAddr Finalize;
- ExecutorAddr Deallocate;
+ ExecutorAddr Initialize;
+ ExecutorAddr Release;
ExecutorAddr RegisterEHFrame;
ExecutorAddr DeregisterEHFrame;
};
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
index 44ef289..41c3089 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
@@ -51,7 +51,11 @@ public:
virtual void reserve(size_t NumBytes, OnReservedFunction OnReserved) = 0;
/// Provides working memory
- virtual char *prepare(ExecutorAddr Addr, size_t ContentSize) = 0;
+ /// The LinkGraph parameter is included to allow implementations to allocate
+ /// working memory from the LinkGraph's allocator, in which case it will be
+ /// deallocated when the LinkGraph is destroyed.
+ virtual char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) = 0;
using OnInitializedFunction = unique_function<void(Expected<ExecutorAddr>)>;
@@ -92,7 +96,8 @@ public:
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override;
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override;
void deinitialize(ArrayRef<ExecutorAddr> Allocations,
OnDeinitializedFunction OnDeInitialized) override;
@@ -142,7 +147,8 @@ public:
void reserve(size_t NumBytes, OnReservedFunction OnReserved) override;
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override;
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override;
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/AllocationActions.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/AllocationActions.h
index 596cc18..b0197f0 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/AllocationActions.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/AllocationActions.h
@@ -13,7 +13,6 @@
#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_ALLOCATIONACTIONS_H
#define LLVM_EXECUTIONENGINE_ORC_SHARED_ALLOCATIONACTIONS_H
-#include "llvm/ADT/FunctionExtras.h"
#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
#include "llvm/Support/Compiler.h"
@@ -54,9 +53,6 @@ inline size_t numDeallocActions(const AllocActions &AAs) {
AAs, [](const AllocActionCallPair &P) { return !!P.Dealloc; });
}
-using OnRunFinalizeActionsCompleteFn =
- unique_function<void(Expected<std::vector<WrapperFunctionCall>>)>;
-
/// Run finalize actions.
///
/// If any finalize action fails then the corresponding dealloc actions will be
@@ -67,16 +63,13 @@ using OnRunFinalizeActionsCompleteFn =
/// be returned. The dealloc actions should be run by calling
/// runDeallocationActions. If this function succeeds then the AA argument will
/// be cleared before the function returns.
-LLVM_ABI void runFinalizeActions(AllocActions &AAs,
- OnRunFinalizeActionsCompleteFn OnComplete);
-
-using OnRunDeallocActionsComeleteFn = unique_function<void(Error)>;
+LLVM_ABI Expected<std::vector<WrapperFunctionCall>>
+runFinalizeActions(AllocActions &AAs);
/// Run deallocation actions.
/// Dealloc actions will be run in reverse order (from last element of DAs to
/// first).
-LLVM_ABI void runDeallocActions(ArrayRef<WrapperFunctionCall> DAs,
- OnRunDeallocActionsComeleteFn OnComplete);
+LLVM_ABI Error runDeallocActions(ArrayRef<WrapperFunctionCall> DAs);
using SPSAllocActionCallPair =
SPSTuple<SPSWrapperFunctionCall, SPSWrapperFunctionCall>;
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
index a4a7fa4..a5f6c4f 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h
@@ -272,6 +272,9 @@ struct ExecutorAddrRange {
}
bool contains(ExecutorAddr Addr) const { return Start <= Addr && Addr < End; }
+ bool contains(const ExecutorAddrRange &Other) {
+ return (Other.Start >= Start && Other.End <= End);
+ }
bool overlaps(const ExecutorAddrRange &Other) {
return !(Other.End <= Start || End <= Other.Start);
}
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
index 99ba456..d68a689 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h
@@ -29,8 +29,9 @@ LLVM_ABI extern const char *SimpleExecutorDylibManagerResolveWrapperName;
LLVM_ABI extern const char *SimpleExecutorMemoryManagerInstanceName;
LLVM_ABI extern const char *SimpleExecutorMemoryManagerReserveWrapperName;
-LLVM_ABI extern const char *SimpleExecutorMemoryManagerFinalizeWrapperName;
-LLVM_ABI extern const char *SimpleExecutorMemoryManagerDeallocateWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerInitializeWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerDeinitializeWrapperName;
+LLVM_ABI extern const char *SimpleExecutorMemoryManagerReleaseWrapperName;
LLVM_ABI extern const char *ExecutorSharedMemoryMapperServiceInstanceName;
LLVM_ABI extern const char *ExecutorSharedMemoryMapperServiceReserveWrapperName;
@@ -73,9 +74,12 @@ using SPSSimpleExecutorDylibManagerResolveSignature = shared::SPSExpected<
using SPSSimpleExecutorMemoryManagerReserveSignature =
shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
uint64_t);
-using SPSSimpleExecutorMemoryManagerFinalizeSignature =
- shared::SPSError(shared::SPSExecutorAddr, shared::SPSFinalizeRequest);
-using SPSSimpleExecutorMemoryManagerDeallocateSignature = shared::SPSError(
+using SPSSimpleExecutorMemoryManagerInitializeSignature =
+ shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+ shared::SPSFinalizeRequest);
+using SPSSimpleExecutorMemoryManagerDeinitializeSignature = shared::SPSError(
+ shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+using SPSSimpleExecutorMemoryManagerReleaseSignature = shared::SPSError(
shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
// ExecutorSharedMemoryMapperService
@@ -93,6 +97,18 @@ using SPSExecutorSharedMemoryMapperServiceDeinitializeSignature =
using SPSExecutorSharedMemoryMapperServiceReleaseSignature = shared::SPSError(
shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+// SimpleNativeMemoryMap APIs.
+using SPSSimpleRemoteMemoryMapReserveSignature =
+ shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+ uint64_t);
+using SPSSimpleRemoteMemoryMapInitializeSignature =
+ shared::SPSExpected<shared::SPSExecutorAddr>(shared::SPSExecutorAddr,
+ shared::SPSFinalizeRequest);
+using SPSSimpleRemoteMemoryMapDeinitializeSignature = shared::SPSError(
+ shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+using SPSSimpleRemoteMemoryMapReleaseSignature = shared::SPSError(
+ shared::SPSExecutorAddr, shared::SPSSequence<shared::SPSExecutorAddr>);
+
using SPSRunAsMainSignature = int64_t(shared::SPSExecutorAddr,
shared::SPSSequence<shared::SPSString>);
using SPSRunAsVoidFunctionSignature = int32_t(shared::SPSExecutorAddr);
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
new file mode 100644
index 0000000..644c4f61
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
@@ -0,0 +1,87 @@
+//===- SimpleRemoteMemoryMapper.h - Remote memory mapper --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple memory mapper that uses EPC calls to implement reserve, initialize,
+// deinitialize, and release.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+#define LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+
+#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+
+namespace llvm::orc {
+
+/// Manages remote memory by making SPS-based EPC calls.
+class LLVM_ABI SimpleRemoteMemoryMapper final : public MemoryMapper {
+public:
+ struct SymbolAddrs {
+ ExecutorAddr Instance;
+ ExecutorAddr Reserve;
+ ExecutorAddr Initialize;
+ ExecutorAddr Deinitialize;
+ ExecutorAddr Release;
+ };
+
+ SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC, SymbolAddrs SAs);
+
+ static Expected<std::unique_ptr<SimpleRemoteMemoryMapper>>
+ Create(ExecutorProcessControl &EPC, SymbolAddrs SAs) {
+ return std::make_unique<SimpleRemoteMemoryMapper>(EPC, SAs);
+ }
+
+ unsigned int getPageSize() override { return EPC.getPageSize(); }
+
+ /// Reserves memory in the remote process by calling a remote
+ /// SPS-wrapper-function with signature
+ ///
+ /// SPSExpected<SPSExecutorAddr>(uint64_t Size).
+ ///
+ /// On success, returns the base address of the reserved range.
+ void reserve(size_t NumBytes, OnReservedFunction OnReserved) override;
+
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override;
+
+ /// Initializes memory within a previously reserved region (applying
+ /// protections and running any finalization actions) by calling a remote
+ /// SPS-wrapper-function with signature
+ ///
+ /// SPSExpected<SPSExecutorAddr>(SPSFinalizeRequest)
+ ///
+ /// On success, returns a key that can be used to deinitialize the region.
+ void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
+
+ /// Given a series of keys from previous initialize calls, deinitialize
+ /// previously initialized memory regions (running dealloc actions, resetting
+ /// permissions and decommitting if possible) by calling a remote
+ /// SPS-wrapper-function with signature
+ ///
+ /// SPSError(SPSSequence<SPSExecutorAddr> Keys)
+ ///
+ void deinitialize(ArrayRef<ExecutorAddr> Allocations,
+ OnDeinitializedFunction OnDeInitialized) override;
+
+ /// Given a sequence of base addresses from previous reserve calls, release
+ /// the underlying ranges (deinitializing any remaining regions within them)
+ /// by calling a remote SPS-wrapper-function with signature
+ ///
+ /// SPSError(SPSSequence<SPSExecutorAddr> Bases)
+ ///
+ void release(ArrayRef<ExecutorAddr> Reservations,
+ OnReleasedFunction OnRelease) override;
+
+private:
+ ExecutorProcessControl &EPC;
+ SymbolAddrs SAs;
+};
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
index 741f203..6224e92 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h
@@ -34,34 +34,65 @@ class LLVM_ABI SimpleExecutorMemoryManager : public ExecutorBootstrapService {
public:
virtual ~SimpleExecutorMemoryManager();
- Expected<ExecutorAddr> allocate(uint64_t Size);
- Error finalize(tpctypes::FinalizeRequest &FR);
- Error deallocate(const std::vector<ExecutorAddr> &Bases);
+ Expected<ExecutorAddr> reserve(uint64_t Size);
+ Expected<ExecutorAddr> initialize(tpctypes::FinalizeRequest &FR);
+ Error deinitialize(const std::vector<ExecutorAddr> &InitKeys);
+ Error release(const std::vector<ExecutorAddr> &Bases);
Error shutdown() override;
void addBootstrapSymbols(StringMap<ExecutorAddr> &M) override;
private:
- struct Allocation {
+ struct RegionInfo {
size_t Size = 0;
- std::vector<shared::WrapperFunctionCall> DeallocationActions;
+ std::vector<shared::WrapperFunctionCall> DeallocActions;
};
- using AllocationsMap = DenseMap<void *, Allocation>;
+ struct SlabInfo {
+ using RegionMap = std::map<ExecutorAddr, RegionInfo>;
+ size_t Size = 0;
+ RegionMap Regions;
+ };
+
+ using SlabMap = std::map<void *, SlabInfo>;
+
+ /// Get a reference to the slab information for the slab containing the given
+ /// address.
+ Expected<SlabInfo &> getSlabInfo(ExecutorAddr A, StringRef Context);
+
+ /// Get a reference to the slab information for the slab *covering* the given
+  /// range. The given range must be a subrange of (possibly equal to) the
+ /// range of the slab itself.
+ Expected<SlabInfo &> getSlabInfo(ExecutorAddrRange R, StringRef Context);
- Error deallocateImpl(void *Base, Allocation &A);
+ /// Create a RegionInfo for the given range, which must not overlap any
+ /// existing region.
+ Expected<RegionInfo &> createRegionInfo(ExecutorAddrRange R,
+ StringRef Context);
+
+ /// Get a reference to the region information for the given address. This
+ /// address must represent the start of an existing initialized region.
+ Expected<RegionInfo &> getRegionInfo(SlabInfo &Slab, ExecutorAddr A,
+ StringRef Context);
+
+ /// Get a reference to the region information for the given address. This
+ /// address must represent the start of an existing initialized region.
+ Expected<RegionInfo &> getRegionInfo(ExecutorAddr A, StringRef Context);
static llvm::orc::shared::CWrapperFunctionResult
reserveWrapper(const char *ArgData, size_t ArgSize);
static llvm::orc::shared::CWrapperFunctionResult
- finalizeWrapper(const char *ArgData, size_t ArgSize);
+ initializeWrapper(const char *ArgData, size_t ArgSize);
+
+ static llvm::orc::shared::CWrapperFunctionResult
+ deinitializeWrapper(const char *ArgData, size_t ArgSize);
static llvm::orc::shared::CWrapperFunctionResult
- deallocateWrapper(const char *ArgData, size_t ArgSize);
+ releaseWrapper(const char *ArgData, size_t ArgSize);
std::mutex M;
- AllocationsMap Allocations;
+ SlabMap Slabs;
};
} // end namespace rt_bootstrap
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index db781b58..1a01fa6 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -571,7 +571,9 @@ struct DoacrossT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct DynamicAllocatorsT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
template <typename T, typename I, typename E> //
@@ -802,6 +804,7 @@ template <typename T, typename I, typename E> //
struct MapT {
using LocatorList = ObjectListT<I, E>;
ENUM(MapType, To, From, Tofrom, Storage);
+ ENUM(AttachModifier, Always, Auto, Never);
ENUM(MapTypeModifier, Always, Close, Delete, Present, Self, OmpxHold);
ENUM(RefModifier, RefPtee, RefPtr, RefPtrPtee);
// See note at the definition of the MapperT type.
@@ -810,8 +813,8 @@ struct MapT {
using MapTypeModifiers = ListT<MapTypeModifier>; // Not a spec name
using TupleTrait = std::true_type;
- std::tuple<OPT(MapType), OPT(MapTypeModifiers), OPT(RefModifier),
- OPT(Mappers), OPT(Iterator), LocatorList>
+ std::tuple<OPT(MapType), OPT(MapTypeModifiers), OPT(AttachModifier),
+ OPT(RefModifier), OPT(Mappers), OPT(Iterator), LocatorList>
t;
};
@@ -1055,7 +1058,9 @@ struct ReplayableT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct ReverseOffloadT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [10.4.2] `safelen` clause
@@ -1077,6 +1082,14 @@ struct ScheduleT {
std::tuple<Kind, OPT(OrderingModifier), OPT(ChunkModifier), OPT(ChunkSize)> t;
};
+// [6.0:361]
+template <typename T, typename I, typename E> //
+struct SelfMapsT {
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
+};
+
// V5.2: [15.8.1] Memory-order clauses
template <typename T, typename I, typename E> //
struct SeqCstT {
@@ -1168,18 +1181,17 @@ struct TransparentT {
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct UnifiedAddressT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [8.2.1] `requirement` clauses
template <typename T, typename I, typename E> //
struct UnifiedSharedMemoryT {
- using EmptyTrait = std::true_type;
-};
-
-template <typename T, typename I, typename E> //
-struct SelfMapsT {
- using EmptyTrait = std::true_type;
+ using Requires = E;
+ using WrapperTrait = std::true_type;
+ OPT(Requires) v;
};
// V5.2: [5.10] `uniform` clause
@@ -1287,14 +1299,12 @@ using ExtensionClausesT =
template <typename T, typename I, typename E>
using EmptyClausesT = std::variant<
AcqRelT<T, I, E>, AcquireT<T, I, E>, CaptureT<T, I, E>, CompareT<T, I, E>,
- DynamicAllocatorsT<T, I, E>, FullT<T, I, E>, InbranchT<T, I, E>,
- MergeableT<T, I, E>, NogroupT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
+ FullT<T, I, E>, InbranchT<T, I, E>, MergeableT<T, I, E>, NogroupT<T, I, E>,
+ NoOpenmpConstructsT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>, NotinbranchT<T, I, E>,
NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>, ReleaseT<T, I, E>,
- ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>, SimdT<T, I, E>,
- ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>,
- UnknownT<T, I, E>, UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>,
- WriteT<T, I, E>, NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
+ SeqCstT<T, I, E>, SimdT<T, I, E>, ThreadsT<T, I, E>, UnknownT<T, I, E>,
+ UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>, WriteT<T, I, E>>;
template <typename T, typename I, typename E>
using IncompleteClausesT =
@@ -1322,18 +1332,20 @@ using WrapperClausesT = std::variant<
AtomicDefaultMemOrderT<T, I, E>, AtT<T, I, E>, BindT<T, I, E>,
CollapseT<T, I, E>, ContainsT<T, I, E>, CopyinT<T, I, E>,
CopyprivateT<T, I, E>, DefaultT<T, I, E>, DestroyT<T, I, E>,
- DetachT<T, I, E>, DeviceTypeT<T, I, E>, EnterT<T, I, E>,
- ExclusiveT<T, I, E>, FailT<T, I, E>, FilterT<T, I, E>, FinalT<T, I, E>,
- FirstprivateT<T, I, E>, HasDeviceAddrT<T, I, E>, HintT<T, I, E>,
- HoldsT<T, I, E>, InclusiveT<T, I, E>, IndirectT<T, I, E>,
+ DetachT<T, I, E>, DeviceTypeT<T, I, E>, DynamicAllocatorsT<T, I, E>,
+ EnterT<T, I, E>, ExclusiveT<T, I, E>, FailT<T, I, E>, FilterT<T, I, E>,
+ FinalT<T, I, E>, FirstprivateT<T, I, E>, HasDeviceAddrT<T, I, E>,
+ HintT<T, I, E>, HoldsT<T, I, E>, InclusiveT<T, I, E>, IndirectT<T, I, E>,
InitializerT<T, I, E>, IsDevicePtrT<T, I, E>, LinkT<T, I, E>,
MessageT<T, I, E>, NocontextT<T, I, E>, NontemporalT<T, I, E>,
NovariantsT<T, I, E>, NumTeamsT<T, I, E>, NumThreadsT<T, I, E>,
OrderedT<T, I, E>, PartialT<T, I, E>, PriorityT<T, I, E>, PrivateT<T, I, E>,
- ProcBindT<T, I, E>, SafelenT<T, I, E>, SeverityT<T, I, E>, SharedT<T, I, E>,
- SimdlenT<T, I, E>, SizesT<T, I, E>, PermutationT<T, I, E>,
- ThreadLimitT<T, I, E>, UniformT<T, I, E>, UpdateT<T, I, E>,
- UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>, UsesAllocatorsT<T, I, E>>;
+ ProcBindT<T, I, E>, ReverseOffloadT<T, I, E>, SafelenT<T, I, E>,
+ SelfMapsT<T, I, E>, SeverityT<T, I, E>, SharedT<T, I, E>, SimdlenT<T, I, E>,
+ SizesT<T, I, E>, PermutationT<T, I, E>, ThreadLimitT<T, I, E>,
+ UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>, UniformT<T, I, E>,
+ UpdateT<T, I, E>, UseDeviceAddrT<T, I, E>, UseDevicePtrT<T, I, E>,
+ UsesAllocatorsT<T, I, E>>;
template <typename T, typename I, typename E>
using UnionOfAllClausesT = typename type::Union< //
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 047baa3..6d6eb5cd 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -708,6 +708,7 @@ bool ConstructDecompositionT<C, H>::applyClause(
tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
{/*MapType=*/MapType::Tofrom,
/*MapTypeModifier=*/std::nullopt,
+ /*AttachModifier=*/std::nullopt,
/*RefModifier=*/std::nullopt,
/*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
/*LocatorList=*/std::move(tofrom)}});
@@ -970,8 +971,9 @@ bool ConstructDecompositionT<C, H>::applyClause(
llvm::omp::Clause::OMPC_map,
tomp::clause::MapT<TypeTy, IdTy, ExprTy>{
{/*MapType=*/MapType::Tofrom, /*MapTypeModifier=*/std::nullopt,
- /*RefModifier=*/std::nullopt, /*Mapper=*/std::nullopt,
- /*Iterator=*/std::nullopt, /*LocatorList=*/std::move(tofrom)}});
+ /*AttachModifier=*/std::nullopt, /*RefModifier=*/std::nullopt,
+ /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
+ /*LocatorList=*/std::move(tofrom)}});
dirTarget->clauses.push_back(map);
applied = true;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index bba0d6e..edcf7a9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -177,6 +177,8 @@ def OMPC_Doacross : Clause<[Spelling<"doacross">]> {
}
def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
let clangClass = "OMPDynamicAllocatorsClause";
+ let flangClass = "OmpDynamicAllocatorsClause";
+ let isValueOptional = true;
}
def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
let flangClass = "OmpDynGroupprivateClause";
@@ -353,6 +355,7 @@ def OMPC_Novariants : Clause<[Spelling<"novariants">]> {
}
def OMPC_NoWait : Clause<[Spelling<"nowait">]> {
let clangClass = "OMPNowaitClause";
+ let isValueOptional = true;
}
def OMP_NUMTASKS_Strict : EnumVal<"strict", 1, 1> {}
def OMP_NUMTASKS_Unknown : EnumVal<"unknown", 2, 0> { let isDefault = 1; }
@@ -466,6 +469,8 @@ def OMPC_Replayable : Clause<[Spelling<"replayable">]> {
}
def OMPC_ReverseOffload : Clause<[Spelling<"reverse_offload">]> {
let clangClass = "OMPReverseOffloadClause";
+ let flangClass = "OmpReverseOffloadClause";
+ let isValueOptional = true;
}
def OMPC_SafeLen : Clause<[Spelling<"safelen">]> {
let clangClass = "OMPSafelenClause";
@@ -540,12 +545,18 @@ def OMPC_Transparent : Clause<[Spelling<"transparent">]> {
}
def OMPC_UnifiedAddress : Clause<[Spelling<"unified_address">]> {
let clangClass = "OMPUnifiedAddressClause";
+ let flangClass = "OmpUnifiedAddressClause";
+ let isValueOptional = true;
}
def OMPC_UnifiedSharedMemory : Clause<[Spelling<"unified_shared_memory">]> {
let clangClass = "OMPUnifiedSharedMemoryClause";
+ let flangClass = "OmpUnifiedSharedMemoryClause";
+ let isValueOptional = true;
}
def OMPC_SelfMaps : Clause<[Spelling<"self_maps">]> {
let clangClass = "OMPSelfMapsClause";
+ let flangClass = "OmpSelfMapsClause";
+ let isValueOptional = true;
}
def OMPC_Uniform : Clause<[Spelling<"uniform">]> {
let flangClass = "Name";
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 1694a33..46b3d53 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -472,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn
__OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
- FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy)
+ FuncPtrTy, FuncPtrTy, VoidPtrPtr, SizeTy)
__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h
index 7c7e988..96d3b2f 100644
--- a/llvm/include/llvm/IR/CFG.h
+++ b/llvm/include/llvm/IR/CFG.h
@@ -42,9 +42,9 @@ template <class Ptr, class USE_iterator> // Predecessor Iterator
class PredIterator {
public:
using iterator_category = std::forward_iterator_tag;
- using value_type = Ptr;
+ using value_type = Ptr *;
using difference_type = std::ptrdiff_t;
- using pointer = Ptr *;
+ using pointer = Ptr **;
using reference = Ptr *;
protected:
@@ -141,7 +141,8 @@ class SuccIterator
std::random_access_iterator_tag, BlockT, int,
BlockT *, BlockT *> {
public:
- using difference_type = int;
+ using value_type = BlockT *;
+ using difference_type = std::ptrdiff_t;
using pointer = BlockT *;
using reference = BlockT *;
diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h
index 39dc7c1..e772095 100644
--- a/llvm/include/llvm/IR/ConstantFPRange.h
+++ b/llvm/include/llvm/IR/ConstantFPRange.h
@@ -230,6 +230,19 @@ public:
/// Return a new range representing the possible values resulting
/// from a subtraction of a value in this range and a value in \p Other.
LLVM_ABI ConstantFPRange sub(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a multiplication of a value in this range and a value in \p Other.
+ LLVM_ABI ConstantFPRange mul(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a division of a value in this range and a value in
+ /// \p Other.
+ LLVM_ABI ConstantFPRange div(const ConstantFPRange &Other) const;
+
+ /// Flush denormal values to zero according to the specified mode.
+ /// For dynamic mode, we return the union of all possible results.
+ LLVM_ABI void flushDenormals(DenormalMode::DenormalModeKind Mode);
};
inline raw_ostream &operator<<(raw_ostream &OS, const ConstantFPRange &CR) {
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index e0292c2..457c60e3b 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -14,7 +14,7 @@
// dbg.value(metadata i32 %foo, ...)
// %bar = void call @ext(%foo);
//
-// and all information is stored in the Value / Metadata hierachy defined
+// and all information is stored in the Value / Metadata hierarchy defined
// elsewhere in LLVM. In the "DbgRecord" design, each instruction /may/ have a
// connection with a DbgMarker, which identifies a position immediately before
// the instruction, and each DbgMarker /may/ then have connections to DbgRecords
@@ -37,7 +37,7 @@
//
// This structure separates the two concerns of the position of the debug-info
// in the function, and the Value that it refers to. It also creates a new
-// "place" in-between the Value / Metadata hierachy where we can customise
+// "place" in-between the Value / Metadata hierarchy where we can customise
// storage and allocation techniques to better suite debug-info workloads.
// NB: as of the initial prototype, none of that has actually been attempted
// yet.
@@ -162,7 +162,7 @@ public:
LLVM_ABI bool isIdenticalToWhenDefined(const DbgRecord &R) const;
/// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic.
/// \p InsertBefore Optional position to insert this intrinsic.
- /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord.
+ /// \returns A new llvm.dbg.* intrinsic representing this DbgRecord.
LLVM_ABI DbgInfoIntrinsic *
createDebugIntrinsic(Module *M, Instruction *InsertBefore) const;
///@}
@@ -530,7 +530,7 @@ public:
LLVM_ABI void setKillAddress();
/// Check whether this kills the address component. This doesn't take into
/// account the position of the intrinsic, therefore a returned value of false
- /// does not guarentee the address is a valid location for the variable at the
+ /// does not guarantee the address is a valid location for the variable at the
/// intrinsic's position in IR.
LLVM_ABI bool isKillAddress() const;
@@ -539,7 +539,7 @@ public:
LLVM_ABI DbgVariableRecord *clone() const;
/// Convert this DbgVariableRecord back into a dbg.value intrinsic.
/// \p InsertBefore Optional position to insert this intrinsic.
- /// \returns A new dbg.value intrinsic representiung this DbgVariableRecord.
+ /// \returns A new dbg.value intrinsic representing this DbgVariableRecord.
LLVM_ABI DbgVariableIntrinsic *
createDebugIntrinsic(Module *M, Instruction *InsertBefore) const;
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 041a4ce..dacda0a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2548,6 +2548,11 @@ public:
std::optional<RoundingMode> Rounding = std::nullopt,
std::optional<fp::ExceptionBehavior> Except = std::nullopt);
+ LLVM_ABI Value *CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name = "");
+
LLVM_ABI Value *CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name = "",
Instruction *MDFrom = nullptr);
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 570d6bc..3b7077c 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -77,6 +77,9 @@ def int_dx_resource_updatecounter
: DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
[IntrInaccessibleMemOrArgMemOnly]>;
+def int_dx_resource_getdimensions_x
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty], [IntrReadMem]>;
+
// Cast between target extension handle types and dxil-style opaque handles
def int_dx_resource_casthandle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
index bf20080..4a0272c 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td
@@ -180,4 +180,98 @@ let TargetPrefix = "riscv" in {
// XSfvfnrclipxfqf
defm int_riscv_sf_vfnrclip_x_f_qf : RISCVSFCustomVFNRCLIP;
defm int_riscv_sf_vfnrclip_xu_f_qf : RISCVSFCustomVFNRCLIP;
+
+ // XSfmm
+ // Output: (output_len)
+ // Input: (input_len, vsew, twiden)
+ class RISCVSFVSet
+ : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem]>;
+
+ // Input: (tss, base, tn)
+ class RISCVSFTileLoad
+ : DefaultAttrsIntrinsic<[],
+ [llvm_anyint_ty, llvm_ptr_ty, LLVMMatchType<0>],
+ [NoCapture<ArgIndex<1>>, IntrHasSideEffects]>,
+ RISCVVIntrinsic;
+
+ // Input: (tss, base, tn)
+ class RISCVSFTileStore
+ : DefaultAttrsIntrinsic<[],
+ [llvm_anyint_ty, llvm_ptr_ty, LLVMMatchType<0>],
+ [NoCapture<ArgIndex<1>>, IntrWriteMem,
+ IntrHasSideEffects]>,
+ RISCVVIntrinsic;
+
+ // Output: ()
+ // Input: (mtd, mat1, mat2, tm, tn, tk, twiden)
+ class RISCVSFCustomMatMul<bit is_float = false>
+ : DefaultAttrsIntrinsic<[], [llvm_anyint_ty, llvm_anyvector_ty,
+ !if(is_float, LLVMMatchType<1>,
+ llvm_anyvector_ty),
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, IntrHasSideEffects,
+ ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<6>>]>,
+ RISCVVIntrinsic;
+
+ def int_riscv_sf_vsettnt : RISCVSFVSet;
+ def int_riscv_sf_vsettm : RISCVSFVSet;
+ def int_riscv_sf_vsettk : RISCVSFVSet;
+
+ def int_riscv_sf_vlte8 : RISCVSFTileLoad;
+ def int_riscv_sf_vlte16 : RISCVSFTileLoad;
+ def int_riscv_sf_vlte32 : RISCVSFTileLoad;
+ def int_riscv_sf_vlte64 : RISCVSFTileLoad;
+ def int_riscv_sf_vste8 : RISCVSFTileStore;
+ def int_riscv_sf_vste16 : RISCVSFTileStore;
+ def int_riscv_sf_vste32 : RISCVSFTileStore;
+ def int_riscv_sf_vste64 : RISCVSFTileStore;
+
+ // Output: (vd)
+ // Input: (tss, tn)
+ def int_riscv_sf_vtmv_v_t
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyint_ty, LLVMMatchType<1>],
+ [IntrNoMem, IntrHasSideEffects]>,
+ RISCVVIntrinsic {
+ let VLOperand = 2;
+ }
+ // Output: ()
+ // Input: (tss, vs2, tn)
+ def int_riscv_sf_vtmv_t_v
+ : DefaultAttrsIntrinsic<[], [LLVMMatchType<1>, llvm_anyvector_ty,
+ llvm_anyint_ty], [IntrNoMem, IntrHasSideEffects]>,
+ RISCVVIntrinsic {
+ let VLOperand = 2;
+ }
+
+ foreach a = ["u", "s"] in {
+ foreach b = ["u", "s"] in {
+ def int_riscv_sf_mm_ # a # _ # b : RISCVSFCustomMatMul;
+ }
+ }
+
+ def int_riscv_sf_mm_f_f : RISCVSFCustomMatMul<true>;
+ foreach e1 = [5, 4] in
+ foreach e2 = [5, 4] in
+ def int_riscv_sf_mm_e # e1 # m # !sub(7, e1) # _e # e2 # m # !sub(7, e2)
+ : RISCVSFCustomMatMul<true>;
+
+ // Output: ()
+ // Input: (mtd, tm, tn, vsew, twiden)
+ def int_riscv_sf_vtzero_t
+ : DefaultAttrsIntrinsic<[],
+ [llvm_anyint_ty, LLVMMatchType<0>,LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<3>>,
+ ImmArg<ArgIndex<4>>, IntrNoMem, IntrHasSideEffects]>,
+ RISCVVIntrinsic;
+
+ // Output: ()
+ // Input: ()
+ def int_riscv_sf_vtdiscard
+ : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>,
+ RISCVVIntrinsic;
} // TargetPrefix = "riscv"
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 66e24fa..49a182be 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -167,6 +167,9 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty]
: DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
[IntrInaccessibleMemOrArgMemOnly]>;
+ def int_spv_resource_getdimensions_x
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty], [IntrReadMem]>;
+
def int_spv_resource_getpointer
: DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
[IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index 6183a7e..a8b647c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -405,17 +405,19 @@ def MIPS16_RET_DF : RuntimeLibcall;
def MIPS16_RET_SC : RuntimeLibcall;
def MIPS16_RET_SF : RuntimeLibcall;
-multiclass LibmLongDoubleLibCall<string libcall_basename = !toupper(NAME),
- string rtbasename = NAME> {
+multiclass LibmLongDoubleLibCall<string libcall_basename = !toupper(!substr(NAME, 0, !sub(!size(NAME), 1))),
+ string rtname = NAME> {
+
+
def NAME#"_f128"
: RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_F128"),
- !strconcat(rtbasename, "l")>;
+ rtname>;
def NAME#"_ppcf128"
: RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_PPCF128"),
- !strconcat(rtbasename, "l")>;
+ rtname>;
def NAME#"_f80"
: RuntimeLibcallImpl<!cast<RuntimeLibcall>(libcall_basename#"_F80"),
- !strconcat(rtbasename, "l")>;
+ rtname>;
}
// AArch64 calls
@@ -765,19 +767,19 @@ def fmodl_ppc128 : RuntimeLibcallImpl<REM_PPCF128, "fmodl">;
def fmaf : RuntimeLibcallImpl<FMA_F32>;
def fma : RuntimeLibcallImpl<FMA_F64>;
-defm fma : LibmLongDoubleLibCall;
+defm fmal : LibmLongDoubleLibCall;
def sqrtf : RuntimeLibcallImpl<SQRT_F32>;
def sqrt : RuntimeLibcallImpl<SQRT_F64>;
-defm sqrt : LibmLongDoubleLibCall;
+defm sqrtl : LibmLongDoubleLibCall;
def cbrtf : RuntimeLibcallImpl<CBRT_F32>;
def cbrt : RuntimeLibcallImpl<CBRT_F64>;
-defm cbrt : LibmLongDoubleLibCall;
+defm cbrtl : LibmLongDoubleLibCall;
def logf : RuntimeLibcallImpl<LOG_F32>;
def log : RuntimeLibcallImpl<LOG_F64>;
-defm log : LibmLongDoubleLibCall;
+defm logl : LibmLongDoubleLibCall;
def __logf_finite : RuntimeLibcallImpl<LOG_FINITE_F32>;
def __log_finite : RuntimeLibcallImpl<LOG_FINITE_F64>;
@@ -787,7 +789,7 @@ def __logl_finite_ppcf128 : RuntimeLibcallImpl<LOG_FINITE_PPCF128, "__logl_finit
def log2f : RuntimeLibcallImpl<LOG2_F32>;
def log2 : RuntimeLibcallImpl<LOG2_F64>;
-defm log2 : LibmLongDoubleLibCall;
+defm log2l : LibmLongDoubleLibCall;
def __log2f_finite : RuntimeLibcallImpl<LOG2_FINITE_F32>;
def __log2_finite : RuntimeLibcallImpl<LOG2_FINITE_F64>;
@@ -797,7 +799,7 @@ def __log2l_finite_ppcf128 : RuntimeLibcallImpl<LOG2_FINITE_PPCF128, "__log2l_fi
def log10f : RuntimeLibcallImpl<LOG10_F32>;
def log10 : RuntimeLibcallImpl<LOG10_F64>;
-defm log10 : LibmLongDoubleLibCall;
+defm log10l : LibmLongDoubleLibCall;
def __log10f_finite : RuntimeLibcallImpl<LOG10_FINITE_F32>;
def __log10_finite : RuntimeLibcallImpl<LOG10_FINITE_F64>;
@@ -807,7 +809,7 @@ def __log10l_finite_ppcf128 : RuntimeLibcallImpl<LOG10_FINITE_PPCF128, "__log10l
def expf : RuntimeLibcallImpl<EXP_F32>;
def exp : RuntimeLibcallImpl<EXP_F64>;
-defm exp : LibmLongDoubleLibCall<"EXP", "exp">;
+defm expl : LibmLongDoubleLibCall<"EXP">;
def __expf_finite : RuntimeLibcallImpl<EXP_FINITE_F32>;
def __exp_finite : RuntimeLibcallImpl<EXP_FINITE_F64>;
@@ -817,7 +819,7 @@ def __expl_finite_ppcf128 : RuntimeLibcallImpl<EXP_FINITE_PPCF128, "__expl_finit
def exp2f : RuntimeLibcallImpl<EXP2_F32>;
def exp2 : RuntimeLibcallImpl<EXP2_F64>;
-defm exp2 : LibmLongDoubleLibCall<"EXP2", "exp2">;
+defm exp2l : LibmLongDoubleLibCall<"EXP2">;
def __exp2f_finite : RuntimeLibcallImpl<EXP2_FINITE_F32>;
def __exp2_finite : RuntimeLibcallImpl<EXP2_FINITE_F64>;
@@ -827,47 +829,47 @@ def __exp2l_finite_ppcf128 : RuntimeLibcallImpl<EXP2_FINITE_PPCF128, "__exp2l_fi
def sinf : RuntimeLibcallImpl<SIN_F32>;
def sin : RuntimeLibcallImpl<SIN_F64>;
-defm sin : LibmLongDoubleLibCall;
+defm sinl : LibmLongDoubleLibCall;
def cosf : RuntimeLibcallImpl<COS_F32>;
def cos : RuntimeLibcallImpl<COS_F64>;
-defm cos : LibmLongDoubleLibCall;
+defm cosl : LibmLongDoubleLibCall;
def tanf : RuntimeLibcallImpl<TAN_F32>;
def tan : RuntimeLibcallImpl<TAN_F64>;
-defm tan : LibmLongDoubleLibCall;
+defm tanl : LibmLongDoubleLibCall;
def sinhf : RuntimeLibcallImpl<SINH_F32>;
def sinh : RuntimeLibcallImpl<SINH_F64>;
-defm sinh : LibmLongDoubleLibCall;
+defm sinhl : LibmLongDoubleLibCall;
def coshf : RuntimeLibcallImpl<COSH_F32>;
def cosh : RuntimeLibcallImpl<COSH_F64>;
-defm cosh : LibmLongDoubleLibCall;
+defm coshl : LibmLongDoubleLibCall;
def tanhf : RuntimeLibcallImpl<TANH_F32>;
def tanh : RuntimeLibcallImpl<TANH_F64>;
-defm tanh : LibmLongDoubleLibCall;
+defm tanhl : LibmLongDoubleLibCall;
def asinf : RuntimeLibcallImpl<ASIN_F32>;
def asin : RuntimeLibcallImpl<ASIN_F64>;
-defm asin : LibmLongDoubleLibCall;
+defm asinl : LibmLongDoubleLibCall;
def acosf : RuntimeLibcallImpl<ACOS_F32>;
def acos : RuntimeLibcallImpl<ACOS_F64>;
-defm acos : LibmLongDoubleLibCall;
+defm acosl : LibmLongDoubleLibCall;
def atanf : RuntimeLibcallImpl<ATAN_F32>;
def atan : RuntimeLibcallImpl<ATAN_F64>;
-defm atan : LibmLongDoubleLibCall;
+defm atanl : LibmLongDoubleLibCall;
def atan2f : RuntimeLibcallImpl<ATAN2_F32>;
def atan2 : RuntimeLibcallImpl<ATAN2_F64>;
-defm atan2 : LibmLongDoubleLibCall;
+defm atan2l : LibmLongDoubleLibCall;
def powf : RuntimeLibcallImpl<POW_F32>;
def pow : RuntimeLibcallImpl<POW_F64>;
-defm pow : LibmLongDoubleLibCall;
+defm powl : LibmLongDoubleLibCall;
def __powf_finite : RuntimeLibcallImpl<POW_FINITE_F32>;
def __pow_finite : RuntimeLibcallImpl<POW_FINITE_F64>;
@@ -877,91 +879,91 @@ def __powl_finite_ppcf128 : RuntimeLibcallImpl<POW_FINITE_PPCF128, "__powl_finit
def ceilf : RuntimeLibcallImpl<CEIL_F32>;
def ceil : RuntimeLibcallImpl<CEIL_F64>;
-defm ceil : LibmLongDoubleLibCall;
+defm ceill : LibmLongDoubleLibCall;
def truncf : RuntimeLibcallImpl<TRUNC_F32>;
def trunc : RuntimeLibcallImpl<TRUNC_F64>;
-defm trunc : LibmLongDoubleLibCall;
+defm truncl : LibmLongDoubleLibCall;
def rintf : RuntimeLibcallImpl<RINT_F32>;
def rint : RuntimeLibcallImpl<RINT_F64>;
-defm rint : LibmLongDoubleLibCall;
+defm rintl : LibmLongDoubleLibCall;
def nearbyintf : RuntimeLibcallImpl<NEARBYINT_F32>;
def nearbyint : RuntimeLibcallImpl<NEARBYINT_F64>;
-defm nearbyint : LibmLongDoubleLibCall;
+defm nearbyintl : LibmLongDoubleLibCall;
def roundf : RuntimeLibcallImpl<ROUND_F32>;
def round : RuntimeLibcallImpl<ROUND_F64>;
-defm round : LibmLongDoubleLibCall;
+defm roundl : LibmLongDoubleLibCall;
def roundevenf : RuntimeLibcallImpl<ROUNDEVEN_F32>;
def roundeven : RuntimeLibcallImpl<ROUNDEVEN_F64>;
-defm roundeven : LibmLongDoubleLibCall;
+defm roundevenl : LibmLongDoubleLibCall;
def floorf : RuntimeLibcallImpl<FLOOR_F32>;
def floor : RuntimeLibcallImpl<FLOOR_F64>;
-defm floor : LibmLongDoubleLibCall;
+defm floorl : LibmLongDoubleLibCall;
def copysignf : RuntimeLibcallImpl<COPYSIGN_F32>;
def copysign : RuntimeLibcallImpl<COPYSIGN_F64>;
-defm copysign : LibmLongDoubleLibCall;
+defm copysignl : LibmLongDoubleLibCall;
def fminf : RuntimeLibcallImpl<FMIN_F32>;
def fmin : RuntimeLibcallImpl<FMIN_F64>;
-defm fmin : LibmLongDoubleLibCall;
+defm fminl : LibmLongDoubleLibCall;
def fmaxf : RuntimeLibcallImpl<FMAX_F32>;
def fmax : RuntimeLibcallImpl<FMAX_F64>;
-defm fmax : LibmLongDoubleLibCall;
+defm fmaxl : LibmLongDoubleLibCall;
def fminimumf : RuntimeLibcallImpl<FMINIMUM_F32>;
def fminimum : RuntimeLibcallImpl<FMINIMUM_F64>;
-defm fminimum : LibmLongDoubleLibCall;
+defm fminimuml : LibmLongDoubleLibCall;
def fmaximumf : RuntimeLibcallImpl<FMAXIMUM_F32>;
def fmaximum : RuntimeLibcallImpl<FMAXIMUM_F64>;
-defm fmaximum : LibmLongDoubleLibCall;
+defm fmaximuml : LibmLongDoubleLibCall;
def fminimum_numf : RuntimeLibcallImpl<FMINIMUM_NUM_F32>;
def fminimum_num : RuntimeLibcallImpl<FMINIMUM_NUM_F64>;
-defm fminimum_num : LibmLongDoubleLibCall;
+defm fminimum_numl : LibmLongDoubleLibCall;
def fmaximum_numf : RuntimeLibcallImpl<FMAXIMUM_NUM_F32>;
def fmaximum_num : RuntimeLibcallImpl<FMAXIMUM_NUM_F64>;
-defm fmaximum_num : LibmLongDoubleLibCall;
+defm fmaximum_numl : LibmLongDoubleLibCall;
def lroundf : RuntimeLibcallImpl<LROUND_F32>;
def lround : RuntimeLibcallImpl<LROUND_F64>;
-defm lround : LibmLongDoubleLibCall;
+defm lroundl : LibmLongDoubleLibCall;
def llroundf : RuntimeLibcallImpl<LLROUND_F32>;
def llround : RuntimeLibcallImpl<LLROUND_F64>;
-defm llround : LibmLongDoubleLibCall;
+defm llroundl : LibmLongDoubleLibCall;
def lrintf : RuntimeLibcallImpl<LRINT_F32>;
def lrint : RuntimeLibcallImpl<LRINT_F64>;
-defm lrint : LibmLongDoubleLibCall;
+defm lrintl : LibmLongDoubleLibCall;
def llrintf : RuntimeLibcallImpl<LLRINT_F32>;
def llrint : RuntimeLibcallImpl<LLRINT_F64>;
-defm llrint : LibmLongDoubleLibCall;
+defm llrintl : LibmLongDoubleLibCall;
def ldexpf : RuntimeLibcallImpl<LDEXP_F32>;
def ldexp : RuntimeLibcallImpl<LDEXP_F64>;
-defm ldexp : LibmLongDoubleLibCall;
+defm ldexpl : LibmLongDoubleLibCall;
def frexpf : RuntimeLibcallImpl<FREXP_F32>;
def frexp : RuntimeLibcallImpl<FREXP_F64>;
-defm frexp : LibmLongDoubleLibCall;
+defm frexpl : LibmLongDoubleLibCall;
def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>;
def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>;
-defm sincospi : LibmLongDoubleLibCall;
+defm sincospil : LibmLongDoubleLibCall;
def modff : RuntimeLibcallImpl<MODF_F32>;
def modf : RuntimeLibcallImpl<MODF_F64>;
-defm modf : LibmLongDoubleLibCall;
+defm modfl : LibmLongDoubleLibCall;
// Floating point environment
def fegetenv : RuntimeLibcallImpl<FEGETENV>;
@@ -1033,7 +1035,7 @@ def __sincos_stret : RuntimeLibcallImpl<SINCOS_STRET_F64>;
def sincosf : RuntimeLibcallImpl<SINCOS_F32>;
def sincos : RuntimeLibcallImpl<SINCOS_F64>;
-defm sincos : LibmLongDoubleLibCall;
+defm sincosl : LibmLongDoubleLibCall;
def bzero : RuntimeLibcallImpl<BZERO>;
def __bzero : RuntimeLibcallImpl<BZERO>;
@@ -1198,9 +1200,9 @@ defvar SecurityCheckCookieIfWinMSVC =
defvar LibmHasSinCosF32 = LibcallImpls<(add sincosf), hasSinCos>;
defvar LibmHasSinCosF64 = LibcallImpls<(add sincos), hasSinCos>;
-defvar LibmHasSinCosF80 = LibcallImpls<(add sincos_f80), hasSinCos>;
-defvar LibmHasSinCosF128 = LibcallImpls<(add sincos_f128), hasSinCos>;
-defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincos_ppcf128), hasSinCos>;
+defvar LibmHasSinCosF80 = LibcallImpls<(add sincosl_f80), hasSinCos>;
+defvar LibmHasSinCosF128 = LibcallImpls<(add sincosl_f128), hasSinCos>;
+defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincosl_ppcf128), hasSinCos>;
defvar LibmHasExp10F32 = LibcallImpls<(add exp10f), hasExp10>;
defvar LibmHasExp10F64 = LibcallImpls<(add exp10), hasExp10>;
@@ -1214,8 +1216,8 @@ defvar DefaultLibmExp10 = [
defvar WindowsMathRemovals = [
- ldexpf, ldexp_f80, ldexp_f128, ldexp_ppcf128,
- frexpf, frexp_f80, frexp_f128, frexp_ppcf128
+ ldexpf, ldexpl_f80, ldexpl_f128, ldexpl_ppcf128,
+ frexpf, frexpl_f80, frexpl_f128, frexpl_ppcf128
];
defvar MostPowI = !listremove(PowiLibcallImpls, [__powitf2_f128, __powitf2_ppc128]);
@@ -1233,11 +1235,11 @@ defvar WinDefaultLibcallImpls = (add WinDefaultLibcallImplsBaseList,
defvar LibmHasFrexpF32 = LibcallImpls<(add frexpf), isNotOSWindowsOrIsCygwinMinGW>;
defvar LibmHasLdexpF32 = LibcallImpls<(add ldexpf), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasFrexpF80 = LibcallImpls<(add frexp_f80), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasLdexpF80 = LibcallImpls<(add ldexp_f80), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasFrexpF80 = LibcallImpls<(add frexpl_f80), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasLdexpF80 = LibcallImpls<(add ldexpl_f80), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasFrexpF128 = LibcallImpls<(add frexp_f128), isNotOSWindowsOrIsCygwinMinGW>;
-defvar LibmHasLdexpF128 = LibcallImpls<(add ldexp_f128), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasFrexpF128 = LibcallImpls<(add frexpl_f128), isNotOSWindowsOrIsCygwinMinGW>;
+defvar LibmHasLdexpF128 = LibcallImpls<(add ldexpl_f128), isNotOSWindowsOrIsCygwinMinGW>;
defvar has__stack_chk_fail = LibcallImpls<(add __stack_chk_fail), isNotOSOpenBSD>;
defvar has__stack_chk_guard =
@@ -2459,7 +2461,7 @@ defvar X86CommonLibcalls =
LibcallImpls<(add __bzero), darwinHas__bzero>,
LibmHasFrexpF32, LibmHasLdexpF32,
LibmHasFrexpF80, LibmHasLdexpF80,
- LibcallImpls<(add frexp_f128, ldexp_f128, exp10l_f128), hasExpFrexplLdexplF128>,
+ LibcallImpls<(add frexpl_f128, ldexpl_f128, exp10l_f128), hasExpFrexplLdexplF128>,
DefaultRuntimeLibcallImpls_f80,
LibmHasExp10F32, LibmHasExp10F64, LibmHasExp10F80,
LibcallImpls<(add MostPowI), isNotOSMSVCRT>,
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h
index 04d0391..58822a0 100644
--- a/llvm/include/llvm/IR/Value.h
+++ b/llvm/include/llvm/IR/Value.h
@@ -484,8 +484,8 @@ public:
/// Remove every uses that can safely be removed.
///
/// This will remove for example uses in llvm.assume.
- /// This should be used when performing want to perform a tranformation but
- /// some Droppable uses pervent it.
+ /// This should be used when we want to perform a transformation but
+ /// some Droppable uses prevent it.
/// This function optionally takes a filter to only remove some droppable
/// uses.
LLVM_ABI void
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 3a9a7f7..000472f 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -105,12 +105,6 @@ setupStatsFile(StringRef StatsFilename);
/// ordered indices to elements in the input array.
LLVM_ABI std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
-/// Updates MemProf attributes (and metadata) based on whether the index
-/// has recorded that we are linking with allocation libraries containing
-/// the necessary APIs for downstream transformations.
-LLVM_ABI void updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index);
-
class LTO;
struct SymbolResolution;
diff --git a/llvm/include/llvm/Object/ELFTypes.h b/llvm/include/llvm/Object/ELFTypes.h
index 5a26e2f..e9a417d 100644
--- a/llvm/include/llvm/Object/ELFTypes.h
+++ b/llvm/include/llvm/Object/ELFTypes.h
@@ -833,6 +833,7 @@ struct BBAddrMap {
bool MultiBBRange : 1;
bool OmitBBEntries : 1;
bool CallsiteEndOffsets : 1;
+ bool BBHash : 1;
bool hasPGOAnalysis() const { return FuncEntryCount || BBFreq || BrProb; }
@@ -845,7 +846,8 @@ struct BBAddrMap {
(static_cast<uint8_t>(BrProb) << 2) |
(static_cast<uint8_t>(MultiBBRange) << 3) |
(static_cast<uint8_t>(OmitBBEntries) << 4) |
- (static_cast<uint8_t>(CallsiteEndOffsets) << 5);
+ (static_cast<uint8_t>(CallsiteEndOffsets) << 5) |
+ (static_cast<uint8_t>(BBHash) << 6);
}
// Decodes from minimum bit width representation and validates no
@@ -854,7 +856,8 @@ struct BBAddrMap {
Features Feat{
static_cast<bool>(Val & (1 << 0)), static_cast<bool>(Val & (1 << 1)),
static_cast<bool>(Val & (1 << 2)), static_cast<bool>(Val & (1 << 3)),
- static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5))};
+ static_cast<bool>(Val & (1 << 4)), static_cast<bool>(Val & (1 << 5)),
+ static_cast<bool>(Val & (1 << 6))};
if (Feat.encode() != Val)
return createStringError(
std::error_code(), "invalid encoding for BBAddrMap::Features: 0x%x",
@@ -864,10 +867,10 @@ struct BBAddrMap {
bool operator==(const Features &Other) const {
return std::tie(FuncEntryCount, BBFreq, BrProb, MultiBBRange,
- OmitBBEntries, CallsiteEndOffsets) ==
+ OmitBBEntries, CallsiteEndOffsets, BBHash) ==
std::tie(Other.FuncEntryCount, Other.BBFreq, Other.BrProb,
Other.MultiBBRange, Other.OmitBBEntries,
- Other.CallsiteEndOffsets);
+ Other.CallsiteEndOffsets, Other.BBHash);
}
};
@@ -920,17 +923,19 @@ struct BBAddrMap {
false}; // Metdata for this basic block.
// Offsets of end of call instructions, relative to the basic block start.
SmallVector<uint32_t, 1> CallsiteEndOffsets;
+ uint64_t Hash = 0; // Hash for this basic block.
BBEntry(uint32_t ID, uint32_t Offset, uint32_t Size, Metadata MD,
- SmallVector<uint32_t, 1> CallsiteEndOffsets)
+ SmallVector<uint32_t, 1> CallsiteEndOffsets, uint64_t Hash)
: ID(ID), Offset(Offset), Size(Size), MD(MD),
- CallsiteEndOffsets(std::move(CallsiteEndOffsets)) {}
+ CallsiteEndOffsets(std::move(CallsiteEndOffsets)), Hash(Hash) {}
UniqueBBID getID() const { return {ID, 0}; }
bool operator==(const BBEntry &Other) const {
return ID == Other.ID && Offset == Other.Offset && Size == Other.Size &&
- MD == Other.MD && CallsiteEndOffsets == Other.CallsiteEndOffsets;
+ MD == Other.MD && CallsiteEndOffsets == Other.CallsiteEndOffsets &&
+ Hash == Other.Hash;
}
bool hasReturn() const { return MD.HasReturn; }
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index c90591d..a7c7c7c 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -163,6 +163,7 @@ struct BBAddrMapEntry {
llvm::yaml::Hex64 Size;
llvm::yaml::Hex64 Metadata;
std::optional<std::vector<llvm::yaml::Hex64>> CallsiteEndOffsets;
+ std::optional<llvm::yaml::Hex64> Hash;
};
uint8_t Version;
llvm::yaml::Hex8 Feature;
diff --git a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h
index d460eb1..1617ae7 100644
--- a/llvm/include/llvm/ProfileData/InstrProfCorrelator.h
+++ b/llvm/include/llvm/ProfileData/InstrProfCorrelator.h
@@ -13,6 +13,7 @@
#define LLVM_PROFILEDATA_INSTRPROFCORRELATOR_H
#include "llvm/ADT/DenseSet.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Debuginfod/BuildIDFetcher.h"
#include "llvm/Object/BuildID.h"
#include "llvm/ProfileData/InstrProf.h"
@@ -24,7 +25,6 @@
#include <vector>
namespace llvm {
-class DWARFContext;
class DWARFDie;
namespace object {
class ObjectFile;
diff --git a/llvm/include/llvm/Support/BinaryStreamWriter.h b/llvm/include/llvm/Support/BinaryStreamWriter.h
index dddf53b..39ce0b6 100644
--- a/llvm/include/llvm/Support/BinaryStreamWriter.h
+++ b/llvm/include/llvm/Support/BinaryStreamWriter.h
@@ -10,6 +10,7 @@
#define LLVM_SUPPORT_BINARYSTREAMWRITER_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/BinaryStreamArray.h"
#include "llvm/Support/BinaryStreamError.h"
@@ -69,8 +70,7 @@ public:
static_assert(std::is_enum<T>::value,
"Cannot call writeEnum with non-Enum type");
- using U = std::underlying_type_t<T>;
- return writeInteger<U>(static_cast<U>(Num));
+ return writeInteger(llvm::to_underlying(Num));
}
/// Write the unsigned integer Value to the underlying stream using ULEB128
diff --git a/llvm/include/llvm/Support/Caching.h b/llvm/include/llvm/Support/Caching.h
index 7fd9bef..cebf071 100644
--- a/llvm/include/llvm/Support/Caching.h
+++ b/llvm/include/llvm/Support/Caching.h
@@ -17,11 +17,10 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
namespace llvm {
-class MemoryBuffer;
-
/// This class wraps an output stream for a file. Most clients should just be
/// able to return an instance of this base class from the stream callback, but
/// if a client needs to perform some action after the stream is written to,
diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h
index 48fc600..39a08d4 100644
--- a/llvm/include/llvm/Support/DebugCounter.h
+++ b/llvm/include/llvm/Support/DebugCounter.h
@@ -178,6 +178,7 @@ protected:
std::string Desc;
SmallVector<Chunk> Chunks;
};
+ bool handleCounterIncrement(CounterInfo &Info);
DenseMap<unsigned, CounterInfo> Counters;
CounterVector RegisteredCounters;
@@ -188,6 +189,8 @@ protected:
bool ShouldPrintCounter = false;
+ bool ShouldPrintCounterQueries = false;
+
bool BreakOnLast = false;
};
diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h
index 7025ca14..fd67d7a 100644
--- a/llvm/include/llvm/Support/DebugLog.h
+++ b/llvm/include/llvm/Support/DebugLog.h
@@ -221,12 +221,10 @@ constexpr ::llvm::StringRef strip_quotes(const char *Str) {
#define LDBG_GET_DEBUG_TYPE_STR() LDBG_GET_DEBUG_TYPE_STR_(DEBUG_TYPE)
/// Helper to call isCurrentDebugType with a StringRef.
-static LLVM_ATTRIBUTE_UNUSED bool ldbgIsCurrentDebugType(StringRef Type,
- int Level) {
+[[maybe_unused]] static bool ldbgIsCurrentDebugType(StringRef Type, int Level) {
return ::llvm::isCurrentDebugType(Type.str().c_str(), Level);
}
-static LLVM_ATTRIBUTE_UNUSED bool ldbgIsCurrentDebugType(int Level,
- StringRef Type) {
+[[maybe_unused]] static bool ldbgIsCurrentDebugType(int Level, StringRef Type) {
return ::llvm::isCurrentDebugType(Type.str().c_str(), Level);
}
@@ -302,7 +300,7 @@ public:
};
/// Remove the path prefix from the file name.
-static LLVM_ATTRIBUTE_UNUSED constexpr const char *
+[[maybe_unused]] static constexpr const char *
getShortFileName(const char *path) {
const char *filename = path;
for (const char *p = path; *p != '\0'; ++p) {
@@ -315,7 +313,7 @@ getShortFileName(const char *path) {
/// Compute the prefix for the debug log in the form of:
/// "[DebugType] File:Line "
/// Where the File is the file name without the path prefix.
-static LLVM_ATTRIBUTE_UNUSED std::string
+[[maybe_unused]] static std::string
computePrefix(StringRef DebugType, const char *File, int Line, int Level) {
std::string Prefix;
raw_string_ostream OsPrefix(Prefix);
@@ -326,7 +324,7 @@ computePrefix(StringRef DebugType, const char *File, int Line, int Level) {
return OsPrefix.str();
}
/// Overload allowing to swap the order of the DebugType and Level arguments.
-static LLVM_ATTRIBUTE_UNUSED std::string
+[[maybe_unused]] static std::string
computePrefix(int Level, const char *File, int Line, StringRef DebugType) {
return computePrefix(DebugType, File, Line, Level);
}
diff --git a/llvm/include/llvm/Support/Format.h b/llvm/include/llvm/Support/Format.h
index 34b224d..b549341 100644
--- a/llvm/include/llvm/Support/Format.h
+++ b/llvm/include/llvm/Support/Format.h
@@ -78,9 +78,20 @@ public:
/// printed, this synthesizes the string into a temporary buffer provided and
/// returns whether or not it is big enough.
+namespace detail {
+template <typename T> struct decay_if_c_char_array {
+ using type = T;
+};
+template <std::size_t N> struct decay_if_c_char_array<char[N]> {
+ using type = const char *;
+};
+template <typename T>
+using decay_if_c_char_array_t = typename decay_if_c_char_array<T>::type;
+} // namespace detail
+
template <typename... Ts>
class format_object final : public format_object_base {
- std::tuple<Ts...> Vals;
+ std::tuple<detail::decay_if_c_char_array_t<Ts>...> Vals;
template <std::size_t... Is>
int snprint_tuple(char *Buffer, unsigned BufferSize,
@@ -96,7 +107,7 @@ public:
format_object(const char *fmt, const Ts &... vals)
: format_object_base(fmt), Vals(vals...) {
static_assert(
- (std::is_scalar_v<Ts> && ...),
+ (std::is_scalar_v<detail::decay_if_c_char_array_t<Ts>> && ...),
"format can't be used with non fundamental / non pointer type");
}
diff --git a/llvm/include/llvm/Support/ScopedPrinter.h b/llvm/include/llvm/Support/ScopedPrinter.h
index 94080e8..7b87fda 100644
--- a/llvm/include/llvm/Support/ScopedPrinter.h
+++ b/llvm/include/llvm/Support/ScopedPrinter.h
@@ -11,6 +11,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -57,8 +58,7 @@ struct HexNumber {
HexNumber(unsigned long Value) : Value(Value) {}
HexNumber(unsigned long long Value) : Value(Value) {}
template <typename EnumT, typename = std::enable_if_t<std::is_enum_v<EnumT>>>
- HexNumber(EnumT Value)
- : HexNumber(static_cast<std::underlying_type_t<EnumT>>(Value)) {}
+ HexNumber(EnumT Value) : HexNumber(llvm::to_underlying(Value)) {}
uint64_t Value;
};
@@ -84,7 +84,7 @@ struct FlagEntry {
: Name(Name), Value(Value) {}
template <typename EnumT, typename = std::enable_if_t<std::is_enum_v<EnumT>>>
FlagEntry(StringRef Name, EnumT Value)
- : FlagEntry(Name, static_cast<std::underlying_type_t<EnumT>>(Value)) {}
+ : FlagEntry(Name, llvm::to_underlying(Value)) {}
StringRef Name;
uint64_t Value;
diff --git a/llvm/include/llvm/Support/SourceMgr.h b/llvm/include/llvm/Support/SourceMgr.h
index 5637b64..8320006 100644
--- a/llvm/include/llvm/Support/SourceMgr.h
+++ b/llvm/include/llvm/Support/SourceMgr.h
@@ -15,6 +15,7 @@
#ifndef LLVM_SUPPORT_SOURCEMGR_H
#define LLVM_SUPPORT_SOURCEMGR_H
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -23,6 +24,10 @@
namespace llvm {
+namespace vfs {
+class FileSystem;
+} // end namespace vfs
+
class raw_ostream;
class SMDiagnostic;
class SMFixIt;
@@ -91,15 +96,25 @@ private:
DiagHandlerTy DiagHandler = nullptr;
void *DiagContext = nullptr;
+ // Optional file system for finding include files.
+ IntrusiveRefCntPtr<vfs::FileSystem> FS;
+
bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
public:
- SourceMgr() = default;
+ /// Create new source manager without support for include files.
+ SourceMgr();
+ /// Create new source manager with the capability of finding include files
+ /// via the provided file system.
+ explicit SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS);
SourceMgr(const SourceMgr &) = delete;
SourceMgr &operator=(const SourceMgr &) = delete;
- SourceMgr(SourceMgr &&) = default;
- SourceMgr &operator=(SourceMgr &&) = default;
- ~SourceMgr() = default;
+ SourceMgr(SourceMgr &&);
+ SourceMgr &operator=(SourceMgr &&);
+ ~SourceMgr();
+
+ IntrusiveRefCntPtr<vfs::FileSystem> getVirtualFileSystem() const;
+ void setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS);
/// Return the include directories of this source manager.
ArrayRef<std::string> getIncludeDirs() const { return IncludeDirectories; }
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 466e2a4..ead7655 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -115,7 +115,8 @@ protected:
// classes.
LLVM_ABI bool createInternal(const std::vector<std::string> &Paths,
vfs::FileSystem &VFS, std::string &Error);
- LLVM_ABI bool createInternal(const MemoryBuffer *MB, std::string &Error);
+ LLVM_ABI bool createInternal(const MemoryBuffer *MB, std::string &Error,
+ bool OrderBySize = false);
SpecialCaseList() = default;
SpecialCaseList(SpecialCaseList const &) = delete;
@@ -126,6 +127,8 @@ private:
class RegexMatcher {
public:
LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -144,6 +147,8 @@ private:
class GlobMatcher {
public:
LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -164,6 +169,9 @@ private:
public:
LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash);
+ LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -174,8 +182,6 @@ private:
return R;
}
- LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
-
std::variant<RegexMatcher, GlobMatcher> M;
bool RemoveDotSlash;
};
@@ -206,6 +212,8 @@ protected:
StringRef Category) const;
private:
+ friend class SpecialCaseList;
+ LLVM_ABI void preprocess(bool OrderBySize);
LLVM_ABI const SpecialCaseList::Matcher *
findMatcher(StringRef Prefix, StringRef Category) const;
};
@@ -222,7 +230,7 @@ private:
/// Parses just-constructed SpecialCaseList entries from a memory buffer.
LLVM_ABI bool parse(unsigned FileIdx, const MemoryBuffer *MB,
- std::string &Error);
+ std::string &Error, bool OrderBySize);
};
} // namespace llvm
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index faaff4a..4aa6c01 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -121,6 +121,7 @@ enum attributeBits {
"The Dynamic Duo! Prefer over all else because this changes " \
"most operands' meaning") \
ENUM_ENTRY(IC_64BIT_REX2, 2, "requires a REX2 prefix") \
+ ENUM_ENTRY(IC_64BIT_REX2_REXW, 3, "requires a REX2 and the W prefix") \
ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \
ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \
ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \
diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h
index 7dca6a0..e22c6d4 100644
--- a/llvm/include/llvm/TableGen/CodeGenHelpers.h
+++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h
@@ -34,32 +34,56 @@ private:
raw_ostream &OS;
};
+// Simple RAII helper for emitting header include guard (ifndef-define-endif).
+class IncludeGuardEmitter {
+public:
+ IncludeGuardEmitter(raw_ostream &OS, StringRef Name)
+ : Name(Name.str()), OS(OS) {
+ OS << "#ifndef " << Name << "\n"
+ << "#define " << Name << "\n\n";
+ }
+ ~IncludeGuardEmitter() { OS << "\n#endif // " << Name << "\n"; }
+
+private:
+ std::string Name;
+ raw_ostream &OS;
+};
+
// Simple RAII helper for emitting namespace scope. Name can be a single
-// namespace (empty for anonymous namespace) or nested namespace.
+// namespace or nested namespace. If the name is empty, will not generate any
+// namespace scope.
class NamespaceEmitter {
public:
- NamespaceEmitter(raw_ostream &OS, StringRef Name) : OS(OS) {
- emitNamespaceStarts(Name);
+ NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed)
+ : Name(trim(NameUntrimmed).str()), OS(OS) {
+ if (!Name.empty())
+ OS << "namespace " << Name << " {\n";
}
~NamespaceEmitter() { close(); }
// Explicit function to close the namespace scopes.
void close() {
- for (StringRef NS : llvm::reverse(Namespaces))
- OS << "} // namespace " << NS << "\n";
- Namespaces.clear();
+ if (!Closed && !Name.empty())
+ OS << "} // namespace " << Name << "\n";
+ Closed = true;
}
private:
- void emitNamespaceStarts(StringRef Name) {
- llvm::SplitString(Name, Namespaces, "::");
- for (StringRef NS : Namespaces)
- OS << "namespace " << NS << " {\n";
+ // Trim "::" prefix. If the namespace specified is "::mlir::toy", then the
+ // generated namespace scope needs to use
+ //
+ // namespace mlir::toy {
+ // }
+ //
+ // and cannot use "namespace ::mlir::toy".
+ static StringRef trim(StringRef Name) {
+ Name.consume_front("::");
+ return Name;
}
-
- SmallVector<StringRef, 2> Namespaces;
+ std::string Name;
raw_ostream &OS;
+ bool Closed = false;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 632be7a..07a858f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -535,6 +535,7 @@ def fdiv : SDNode<"ISD::FDIV" , SDTFPBinOp>;
def frem : SDNode<"ISD::FREM" , SDTFPBinOp>;
def fma : SDNode<"ISD::FMA" , SDTFPTernaryOp, [SDNPCommutative]>;
def fmad : SDNode<"ISD::FMAD" , SDTFPTernaryOp, [SDNPCommutative]>;
+def fmuladd : SDNode<"ISD::FMULADD" , SDTFPTernaryOp, [SDNPCommutative]>;
def fabs : SDNode<"ISD::FABS" , SDTFPUnaryOp>;
def fminnum : SDNode<"ISD::FMINNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index b1fca55..2ac58a5 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -161,6 +161,8 @@ inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
+LLVM_ABI void printXSfmmVType(unsigned VType, raw_ostream &OS);
+
LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index e62aa6d..a94eab1 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -108,6 +108,7 @@ X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE_S, "arrowlake-s")
X86_CPU_SUBTYPE(INTEL_COREI7_PANTHERLAKE, "pantherlake")
X86_CPU_SUBTYPE(AMDFAM1AH_ZNVER5, "znver5")
X86_CPU_SUBTYPE(INTEL_COREI7_DIAMONDRAPIDS, "diamondrapids")
+X86_CPU_SUBTYPE(INTEL_COREI7_NOVALAKE, "novalake")
// Alternate names supported by __builtin_cpu_is and target multiversioning.
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "raptorlake")
@@ -115,6 +116,7 @@ X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "meteorlake")
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_SAPPHIRERAPIDS, "emeraldrapids")
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ARROWLAKE_S,"lunarlake")
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "gracemont")
+X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_PANTHERLAKE, "wildcatlake")
#undef X86_CPU_SUBTYPE_ALIAS
#undef X86_CPU_SUBTYPE
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h
index f6aeaad..80f3d35 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.h
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.h
@@ -116,6 +116,8 @@ enum CPUKind {
CK_ArrowlakeS,
CK_Lunarlake,
CK_Pantherlake,
+ CK_Wildcatlake,
+ CK_Novalake,
CK_Sierraforest,
CK_Grandridge,
CK_Graniterapids,
diff --git a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
index 558984f..eb2b34d 100644
--- a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
@@ -12,9 +12,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
#define LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
// True if I is trivially rematerialzable, e.g. InsertElementInst
LLVM_ABI bool isTriviallyMaterializable(Instruction &I);
@@ -24,8 +22,6 @@ LLVM_ABI void
doRematerializations(Function &F, SuspendCrossingInfo &Checker,
std::function<bool(Instruction &)> IsMaterializable);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
index 6cdf83c0..356f9ca 100644
--- a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
@@ -13,9 +13,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
#define LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
@@ -38,6 +36,7 @@ void collectSpillsAndAllocasFromInsts(
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
const SuspendCrossingInfo &Checker, const DominatorTree &DT,
const coro::Shape &Shape);
+
void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
const SuspendCrossingInfo &Checker);
@@ -52,8 +51,6 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &DT,
BasicBlock::iterator getSpillInsertionPt(const coro::Shape &, Value *Def,
const DominatorTree &DT);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index f2de083..576f1eb 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -95,6 +95,16 @@ public:
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing);
};
+
+/// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+/// when we don't have an index that has recorded that we are linking with
+/// allocation libraries containing the necessary APIs for downstream
+/// transformations.
+class MemProfRemoveInfo : public PassInfoMixin<MemProfRemoveInfo> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
index 48e8c86..2db3f6d4 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
@@ -13,7 +13,6 @@
#ifndef LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
#define LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Support/Compiler.h"
@@ -79,6 +78,10 @@ public:
LLVM_ABI void
RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs = nullptr);
+
+ /// Rewrite all uses and simplify the inserted PHI nodes.
+ /// Use this method to preserve behavior when replacing SSAUpdater.
+ void RewriteAndOptimizeAllUses(DominatorTree &DT);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index e9782da..155e6bd 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -19,8 +19,7 @@
#include <cstdint>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
// The BlockIndexer will gather all related records associated with a
// process+thread and group them by 'Block'.
@@ -63,7 +62,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKINDEXER_H
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index caf78c5..81944a5 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/RecordPrinter.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockPrinter : public RecordVisitor {
enum class State {
@@ -55,7 +54,6 @@ public:
void reset() { CurrentState = State::Start; }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKPRINTER_H
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index b88785c..5e7b25c 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -16,8 +16,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockVerifier : public RecordVisitor {
public:
@@ -64,7 +63,6 @@ public:
void reset();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKVERIFIER_H
diff --git a/llvm/include/llvm/XRay/FDRLogBuilder.h b/llvm/include/llvm/XRay/FDRLogBuilder.h
index f07c446..5f7b815 100644
--- a/llvm/include/llvm/XRay/FDRLogBuilder.h
+++ b/llvm/include/llvm/XRay/FDRLogBuilder.h
@@ -10,8 +10,7 @@
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The LogBuilder class allows for creating ad-hoc collections of records
/// through the `add<...>(...)` function. An example use of this API is in
@@ -34,7 +33,6 @@ public:
std::vector<std::unique_ptr<Record>> consume() { return std::move(Records); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRLOGBUILDER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 473777f..13bb711 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -15,8 +15,7 @@
#include <memory>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordConsumer {
public:
@@ -48,7 +47,6 @@ public:
Error consume(std::unique_ptr<Record> R) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDCONSUMER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 083b571..b953f62 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -14,8 +14,7 @@
#include "llvm/XRay/XRayRecord.h"
#include <memory>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordProducer {
public:
@@ -45,7 +44,6 @@ public:
Expected<std::unique_ptr<Record>> produce() override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDPRODUCER_H
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 7ee8db6..91689cae 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -23,8 +23,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordVisitor;
class RecordInitializer;
@@ -444,7 +443,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDS_H
diff --git a/llvm/include/llvm/XRay/FDRTraceExpander.h b/llvm/include/llvm/XRay/FDRTraceExpander.h
index 197c123..ca400c9 100644
--- a/llvm/include/llvm/XRay/FDRTraceExpander.h
+++ b/llvm/include/llvm/XRay/FDRTraceExpander.h
@@ -17,8 +17,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class TraceExpander : public RecordVisitor {
// Type-erased callback for handling individual XRayRecord instances.
@@ -56,7 +55,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEEXPANDER_H
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index a3dc58e..957039d 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The FDRTraceWriter allows us to hand-craft an XRay Flight Data Recorder
/// (FDR) mode log file. This is used primarily for testing, generating
@@ -50,7 +49,6 @@ private:
support::endian::Writer OS;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEWRITER_H
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index ecdb975..758ca29 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -19,15 +19,13 @@
#include "llvm/XRay/XRayRecord.h"
#include <cstdint>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// Convenience function for loading the file header given a data extractor at a
/// specified offset.
LLVM_ABI Expected<XRayFileHeader>
readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FILEHEADERREADER_H
diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h
index 07b418b..8521e09 100644
--- a/llvm/include/llvm/XRay/Graph.h
+++ b/llvm/include/llvm/XRay/Graph.h
@@ -23,8 +23,7 @@
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Error.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Graph object represents a Directed Graph and is used in XRay to compute
/// and store function call graphs and associated statistical information.
@@ -485,6 +484,6 @@ public:
return p;
}
};
-}
-}
+} // namespace llvm::xray
+
#endif
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index b5371478..c5e7ebf 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -23,9 +23,7 @@
#include <unordered_map>
#include <vector>
-namespace llvm {
-
-namespace xray {
+namespace llvm::xray {
// Forward declare to make a friend.
class InstrumentationMap;
@@ -102,11 +100,11 @@ public:
const SledContainer &sleds() const { return Sleds; };
};
-} // end namespace xray
-
-namespace yaml {
+} // end namespace llvm::xray
-template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
+namespace llvm {
+template <>
+struct yaml::ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
static void enumeration(IO &IO, xray::SledEntry::FunctionKinds &Kind) {
IO.enumCase(Kind, "function-enter", xray::SledEntry::FunctionKinds::ENTRY);
IO.enumCase(Kind, "function-exit", xray::SledEntry::FunctionKinds::EXIT);
@@ -118,7 +116,7 @@ template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
}
};
-template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
+template <> struct yaml::MappingTraits<xray::YAMLXRaySledEntry> {
static void mapping(IO &IO, xray::YAMLXRaySledEntry &Entry) {
IO.mapRequired("id", Entry.FuncId);
IO.mapRequired("address", Entry.Address);
@@ -131,10 +129,7 @@ template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
static constexpr bool flow = true;
};
-
-} // end namespace yaml
-
-} // end namespace llvm
+} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRaySledEntry)
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index e30c01e..b5b8dd2 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -22,8 +22,7 @@
#include <utility>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class Profile;
@@ -144,7 +143,6 @@ public:
bool empty() const { return Blocks.empty(); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 5d2c277..3281221 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -17,8 +17,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI RecordPrinter : public RecordVisitor {
raw_ostream &OS;
@@ -44,7 +43,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_RECORDPRINTER_H
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index 5e4e40a..13ada22 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -21,8 +21,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Trace object represents the records that have been loaded from XRay
/// log files generated by instrumented binaries. We encapsulate the logic of
@@ -76,7 +75,6 @@ LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
bool Sort = false);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_TRACE_H
diff --git a/llvm/include/llvm/XRay/XRayRecord.h b/llvm/include/llvm/XRay/XRayRecord.h
index 238bf3d..8f3440c 100644
--- a/llvm/include/llvm/XRay/XRayRecord.h
+++ b/llvm/include/llvm/XRay/XRayRecord.h
@@ -18,8 +18,7 @@
#include <vector>
#include <string>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// XRay traces all have a header providing some top-matter information useful
/// to help tools determine how to interpret the information available in the
@@ -98,7 +97,6 @@ struct XRayRecord {
std::string Data;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_XRAYRECORD_H
diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h
index 6062606..6bf4f1d 100644
--- a/llvm/include/llvm/XRay/YAMLXRayRecord.h
+++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h
@@ -17,8 +17,7 @@
#include "llvm/Support/YAMLTraits.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
struct YAMLXRayFileHeader {
uint16_t Version;
@@ -46,13 +45,12 @@ struct YAMLXRayTrace {
std::vector<YAMLXRayRecord> Records;
};
-} // namespace xray
-
-namespace yaml {
+} // namespace llvm::xray
+namespace llvm {
// YAML Traits
// -----------
-template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
+template <> struct yaml::ScalarEnumerationTraits<xray::RecordTypes> {
static void enumeration(IO &IO, xray::RecordTypes &Type) {
IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER);
IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
@@ -63,7 +61,7 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayFileHeader> {
static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) {
IO.mapRequired("version", Header.Version);
IO.mapRequired("type", Header.Type);
@@ -73,7 +71,7 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayRecord> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayRecord> {
static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
IO.mapRequired("type", Record.RecordType);
IO.mapOptional("func-id", Record.FuncId);
@@ -90,7 +88,7 @@ template <> struct MappingTraits<xray::YAMLXRayRecord> {
static constexpr bool flow = true;
};
-template <> struct MappingTraits<xray::YAMLXRayTrace> {
+template <> struct yaml::MappingTraits<llvm::xray::YAMLXRayTrace> {
static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) {
// A trace file contains two parts, the header and the list of all the
// trace records.
@@ -98,8 +96,6 @@ template <> struct MappingTraits<xray::YAMLXRayTrace> {
IO.mapRequired("records", Trace.Records);
}
};
-
-} // namespace yaml
} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord)
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 45c889c..a5ba197 100755
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2177,16 +2177,13 @@ Constant *constantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
return PoisonValue::get(VT->getElementType());
// TODO: Handle undef.
- if (!isa<ConstantVector>(Op) && !isa<ConstantDataVector>(Op))
- return nullptr;
-
- auto *EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(0U));
+ auto *EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(0U));
if (!EltC)
return nullptr;
APInt Acc = EltC->getValue();
for (unsigned I = 1, E = VT->getNumElements(); I != E; I++) {
- if (!(EltC = dyn_cast<ConstantInt>(Op->getAggregateElement(I))))
+ if (!(EltC = dyn_cast_or_null<ConstantInt>(Op->getAggregateElement(I))))
return nullptr;
const APInt &X = EltC->getValue();
switch (IID) {
@@ -3059,35 +3056,25 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
Val = Val | Val << 1;
return ConstantInt::get(Ty, Val);
}
-
- default:
- return nullptr;
}
}
- switch (IntrinsicID) {
- default: break;
- case Intrinsic::vector_reduce_add:
- case Intrinsic::vector_reduce_mul:
- case Intrinsic::vector_reduce_and:
- case Intrinsic::vector_reduce_or:
- case Intrinsic::vector_reduce_xor:
- case Intrinsic::vector_reduce_smin:
- case Intrinsic::vector_reduce_smax:
- case Intrinsic::vector_reduce_umin:
- case Intrinsic::vector_reduce_umax:
- if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
- return C;
- break;
- }
-
- // Support ConstantVector in case we have an Undef in the top.
- if (isa<ConstantVector>(Operands[0]) ||
- isa<ConstantDataVector>(Operands[0]) ||
- isa<ConstantAggregateZero>(Operands[0])) {
+ if (Operands[0]->getType()->isVectorTy()) {
auto *Op = cast<Constant>(Operands[0]);
switch (IntrinsicID) {
default: break;
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ if (Constant *C = constantFoldVectorReduce(IntrinsicID, Operands[0]))
+ return C;
+ break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
@@ -3116,10 +3103,15 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
case Intrinsic::wasm_alltrue:
// Check each element individually
unsigned E = cast<FixedVectorType>(Op->getType())->getNumElements();
- for (unsigned I = 0; I != E; ++I)
- if (Constant *Elt = Op->getAggregateElement(I))
- if (Elt->isZeroValue())
- return ConstantInt::get(Ty, 0);
+ for (unsigned I = 0; I != E; ++I) {
+ Constant *Elt = Op->getAggregateElement(I);
+ // Return false as soon as we find a non-true element.
+ if (Elt && Elt->isZeroValue())
+ return ConstantInt::get(Ty, 0);
+ // Bail as soon as we find an element we cannot prove to be true.
+ if (!Elt || !isa<ConstantInt>(Elt))
+ return nullptr;
+ }
return ConstantInt::get(Ty, 1);
}
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index b78cc03e..f9bf092 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -281,6 +281,38 @@ static StructType *getOrCreateElementStruct(Type *ElemType, StringRef Name) {
return StructType::create(ElemType, Name);
}
+static Type *getTypeWithoutPadding(Type *Ty) {
+ // Recursively remove padding from structures.
+ if (auto *ST = dyn_cast<StructType>(Ty)) {
+ LLVMContext &Ctx = Ty->getContext();
+ SmallVector<Type *> ElementTypes;
+ ElementTypes.reserve(ST->getNumElements());
+ for (Type *ElTy : ST->elements()) {
+ if (isa<PaddingExtType>(ElTy))
+ continue;
+ ElementTypes.push_back(getTypeWithoutPadding(ElTy));
+ }
+
+ // Handle explicitly padded cbuffer arrays like { [ n x paddedty ], ty }
+ if (ElementTypes.size() == 2)
+ if (auto *AT = dyn_cast<ArrayType>(ElementTypes[0]))
+ if (ElementTypes[1] == AT->getElementType())
+ return ArrayType::get(ElementTypes[1], AT->getNumElements() + 1);
+
+ // If we only have a single element, don't wrap it in a struct.
+ if (ElementTypes.size() == 1)
+ return ElementTypes[0];
+
+ return StructType::get(Ctx, ElementTypes, /*IsPacked=*/false);
+ }
+ // Arrays just need to have their element type adjusted.
+ if (auto *AT = dyn_cast<ArrayType>(Ty))
+ return ArrayType::get(getTypeWithoutPadding(AT->getElementType()),
+ AT->getNumElements());
+ // Anything else should be good as is.
+ return Ty;
+}
+
StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
SmallString<64> TypeName;
@@ -334,14 +366,21 @@ StructType *ResourceTypeInfo::createElementStruct(StringRef CBufferName) {
}
case ResourceKind::CBuffer: {
auto *RTy = cast<CBufferExtType>(HandleTy);
- LayoutExtType *LayoutType = cast<LayoutExtType>(RTy->getResourceType());
- StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
SmallString<64> Name = getResourceKindName(Kind);
if (!CBufferName.empty()) {
Name.append(".");
Name.append(CBufferName);
}
- return StructType::create(Ty->elements(), Name);
+
+ // TODO: Remove this when we update the frontend to use explicit padding.
+ if (LayoutExtType *LayoutType =
+ dyn_cast<LayoutExtType>(RTy->getResourceType())) {
+ StructType *Ty = cast<StructType>(LayoutType->getWrappedType());
+ return StructType::create(Ty->elements(), Name);
+ }
+
+ return getOrCreateElementStruct(
+ getTypeWithoutPadding(RTy->getResourceType()), Name);
}
case ResourceKind::Sampler: {
auto *RTy = cast<SamplerExtType>(HandleTy);
@@ -454,10 +493,10 @@ uint32_t ResourceTypeInfo::getCBufferSize(const DataLayout &DL) const {
Type *ElTy = cast<CBufferExtType>(HandleTy)->getResourceType();
+ // TODO: Remove this when we update the frontend to use explicit padding.
if (auto *LayoutTy = dyn_cast<LayoutExtType>(ElTy))
return LayoutTy->getSize();
- // TODO: What should we do with unannotated arrays?
return DL.getTypeAllocSize(ElTy);
}
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 8d20b0e..805b682 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1180,32 +1180,41 @@ bool DependenceInfo::isKnownLessThan(const SCEV *S, const SCEV *Size) const {
S = SE->getTruncateOrZeroExtend(S, MaxType);
Size = SE->getTruncateOrZeroExtend(Size, MaxType);
- // Special check for addrecs using BE taken count
- if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S))
- if (AddRec->isAffine() && AddRec->hasNoSignedWrap()) {
- const SCEV *BECount = SE->getBackedgeTakenCount(AddRec->getLoop());
- const SCEV *Start = AddRec->getStart();
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
- const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
- const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
-
- // If the value of Step is non-negative and the AddRec is non-wrap, it
- // reaches its maximum at the last iteration. So it's enouth to check
- // whether End - Size is negative.
- if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
- return true;
+ auto CheckAddRecBECount = [&]() {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AddRec || !AddRec->isAffine() || !AddRec->hasNoSignedWrap())
+ return false;
+ const SCEV *BECount = collectUpperBound(AddRec->getLoop(), MaxType);
+ // If the BTC cannot be computed, check the base case for S.
+ if (!BECount || isa<SCEVCouldNotCompute>(BECount))
+ return false;
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ const SCEV *End = AddRec->evaluateAtIteration(BECount, *SE);
+ const SCEV *Diff0 = SE->getMinusSCEV(Start, Size);
+ const SCEV *Diff1 = SE->getMinusSCEV(End, Size);
+
+ // If the value of Step is non-negative and the AddRec is non-wrap, it
+ // reaches its maximum at the last iteration. So it's enough to check
+ // whether End - Size is negative.
+ if (SE->isKnownNonNegative(Step) && SE->isKnownNegative(Diff1))
+ return true;
- // If the value of Step is non-positive and the AddRec is non-wrap, the
- // initial value is its maximum.
- if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
- return true;
+ // If the value of Step is non-positive and the AddRec is non-wrap, the
+ // initial value is its maximum.
+ if (SE->isKnownNonPositive(Step) && SE->isKnownNegative(Diff0))
+ return true;
- // Even if we don't know the sign of Step, either Start or End must be
- // the maximum value of the AddRec since it is non-wrap.
- if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
- return true;
- }
+ // Even if we don't know the sign of Step, either Start or End must be
+ // the maximum value of the AddRec since it is non-wrap.
+ if (SE->isKnownNegative(Diff0) && SE->isKnownNegative(Diff1))
+ return true;
+
+ return false;
+ };
+
+ if (CheckAddRecBECount())
+ return true;
// Check using normal isKnownNegative
const SCEV *LimitedBound = SE->getMinusSCEV(S, Size);
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b8c540c..9f8ac6e 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -849,17 +849,12 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
/// %sum.2 = select %cmp, %add, %sum.1
RecurrenceDescriptor::InstDesc
RecurrenceDescriptor::isConditionalRdxPattern(Instruction *I) {
- SelectInst *SI = dyn_cast<SelectInst>(I);
- if (!SI)
- return InstDesc(false, I);
-
- CmpInst *CI = dyn_cast<CmpInst>(SI->getCondition());
+ Value *TrueVal, *FalseVal;
// Only handle single use cases for now.
- if (!CI || !CI->hasOneUse())
+ if (!match(I,
+ m_Select(m_OneUse(m_Cmp()), m_Value(TrueVal), m_Value(FalseVal))))
return InstDesc(false, I);
- Value *TrueVal = SI->getTrueValue();
- Value *FalseVal = SI->getFalseValue();
// Handle only when either of operands of select instruction is a PHI
// node for now.
if ((isa<PHINode>(TrueVal) && isa<PHINode>(FalseVal)) ||
@@ -886,7 +881,7 @@ RecurrenceDescriptor::isConditionalRdxPattern(Instruction *I) {
if (!IPhi || IPhi != FalseVal)
return InstDesc(false, I);
- return InstDesc(true, SI);
+ return InstDesc(true, I);
}
RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4e38626..e08ef60 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6644,7 +6644,7 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
"Invalid mask width");
// If index-width (mask size) is less than pointer-size then mask is
// 1-extended.
- if (match(Op1, m_PtrToInt(m_Specific(Op0))))
+ if (match(Op1, m_PtrToIntOrAddr(m_Specific(Op0))))
return Op0;
// NOTE: We may have attributes associated with the return value of the
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index a8c3173..d84721b 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -986,8 +986,8 @@ PreservedAnalyses LoopPrinterPass::run(Function &F,
return PreservedAnalyses::all();
}
-void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) {
-
+void llvm::printLoop(const Loop &L, raw_ostream &OS,
+ const std::string &Banner) {
if (forcePrintModuleIR()) {
// handling -print-module-scope
OS << Banner << " (loop: ";
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index f90717d..1d1a5560 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -61,6 +61,9 @@ static cl::opt<SkipMLPolicyCriteria> SkipPolicy(
static cl::opt<std::string> ModelSelector("ml-inliner-model-selector",
cl::Hidden, cl::init(""));
+static cl::opt<bool> StopImmediatelyForTest("ml-inliner-stop-immediately",
+ cl::Hidden);
+
#if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL)
// codegen-ed file
#include "InlinerSizeModel.h" // NOLINT
@@ -214,6 +217,7 @@ MLInlineAdvisor::MLInlineAdvisor(
return;
}
ModelRunner->switchContext("");
+ ForceStop = StopImmediatelyForTest;
}
unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const {
@@ -379,9 +383,17 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) {
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller);
if (SkipPolicy == SkipMLPolicyCriteria::IfCallerIsNotCold) {
- if (!PSI.isFunctionEntryCold(&Caller))
- return std::make_unique<InlineAdvice>(this, CB, ORE,
- GetDefaultAdvice(CB));
+ if (!PSI.isFunctionEntryCold(&Caller)) {
+ // Return a MLInlineAdvice, despite delegating to the default advice,
+ // because we need to keep track of the internal state. This is different
+ // from the other instances where we return a "default" InlineAdvice,
+ // which happen at points we won't come back to the MLAdvisor for
+ // decisions requiring that state.
+ return ForceStop ? std::make_unique<InlineAdvice>(this, CB, ORE,
+ GetDefaultAdvice(CB))
+ : std::make_unique<MLInlineAdvice>(this, CB, ORE,
+ GetDefaultAdvice(CB));
+ }
}
auto MandatoryKind = InlineAdvisor::getMandatoryKind(CB, FAM, ORE);
// If this is a "never inline" case, there won't be any changes to internal
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index ab37338..0b2e3fc 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -393,7 +393,7 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysisType &AA,
/// \param AA The AliasAnalysis we used for our search.
/// \param AllowImpreciseClobber Always false, unless we do relaxed verify.
-LLVM_ATTRIBUTE_UNUSED static void
+[[maybe_unused]] static void
checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
const MemoryLocation &StartLoc, const MemorySSA &MSSA,
const UpwardsMemoryQuery &Query, BatchAAResults &AA,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5b4cd9..e06b095 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -1774,7 +1774,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
{
const SCEV *LHS;
const SCEV *RHS;
- if (matchURem(Op, LHS, RHS))
+ if (match(Op, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), *this)))
return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1),
getZeroExtendExpr(RHS, Ty, Depth + 1));
}
@@ -1840,19 +1840,19 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
// = zext((2^K * (trunc X to i{N-K}))<nuw>) to iM
// = (2^K * (zext(trunc X to i{N-K}) to iM))<nuw>.
//
- if (SM->getNumOperands() == 2)
- if (auto *MulLHS = dyn_cast<SCEVConstant>(SM->getOperand(0)))
- if (MulLHS->getAPInt().isPowerOf2())
- if (auto *TruncRHS = dyn_cast<SCEVTruncateExpr>(SM->getOperand(1))) {
- int NewTruncBits = getTypeSizeInBits(TruncRHS->getType()) -
- MulLHS->getAPInt().logBase2();
- Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits);
- return getMulExpr(
- getZeroExtendExpr(MulLHS, Ty),
- getZeroExtendExpr(
- getTruncateExpr(TruncRHS->getOperand(), NewTruncTy), Ty),
- SCEV::FlagNUW, Depth + 1);
- }
+ const APInt *C;
+ const SCEV *TruncRHS;
+ if (match(SM,
+ m_scev_Mul(m_scev_APInt(C), m_scev_Trunc(m_SCEV(TruncRHS)))) &&
+ C->isPowerOf2()) {
+ int NewTruncBits =
+ getTypeSizeInBits(SM->getOperand(1)->getType()) - C->logBase2();
+ Type *NewTruncTy = IntegerType::get(getContext(), NewTruncBits);
+ return getMulExpr(
+ getZeroExtendExpr(SM->getOperand(0), Ty),
+ getZeroExtendExpr(getTruncateExpr(TruncRHS, NewTruncTy), Ty),
+ SCEV::FlagNUW, Depth + 1);
+ }
}
// zext(umin(x, y)) -> umin(zext(x), zext(y))
@@ -2699,17 +2699,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
// Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
- if (Ops.size() == 2) {
- const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]);
- if (Mul && Mul->getNumOperands() == 2 &&
- Mul->getOperand(0)->isAllOnesValue()) {
- const SCEV *X;
- const SCEV *Y;
- if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) {
- return getMulExpr(Y, getUDivExpr(X, Y));
- }
- }
- }
+ const SCEV *Y;
+ if (Ops.size() == 2 &&
+ match(Ops[0],
+ m_scev_Mul(m_scev_AllOnes(),
+ m_scev_URem(m_scev_Specific(Ops[1]), m_SCEV(Y), *this))))
+ return getMulExpr(Y, getUDivExpr(Ops[1], Y));
// Skip past any other cast SCEVs.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
@@ -3149,20 +3144,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
if (Ops.size() == 2) {
// C1*(C2+V) -> C1*C2 + C1*V
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
- // If any of Add's ops are Adds or Muls with a constant, apply this
- // transformation as well.
- //
- // TODO: There are some cases where this transformation is not
- // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of
- // this transformation should be narrowed down.
- if (Add->getNumOperands() == 2 && containsConstantInAddMulChain(Add)) {
- const SCEV *LHS = getMulExpr(LHSC, Add->getOperand(0),
- SCEV::FlagAnyWrap, Depth + 1);
- const SCEV *RHS = getMulExpr(LHSC, Add->getOperand(1),
- SCEV::FlagAnyWrap, Depth + 1);
- return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1);
- }
+ // If any of Add's ops are Adds or Muls with a constant, apply this
+ // transformation as well.
+ //
+ // TODO: There are some cases where this transformation is not
+ // profitable; for example, Add = (C0 + X) * Y + Z. Maybe the scope of
+ // this transformation should be narrowed down.
+ const SCEV *Op0, *Op1;
+ if (match(Ops[1], m_scev_Add(m_SCEV(Op0), m_SCEV(Op1))) &&
+ containsConstantInAddMulChain(Ops[1])) {
+ const SCEV *LHS = getMulExpr(LHSC, Op0, SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *RHS = getMulExpr(LHSC, Op1, SCEV::FlagAnyWrap, Depth + 1);
+ return getAddExpr(LHS, RHS, SCEV::FlagAnyWrap, Depth + 1);
+ }
if (Ops[0]->isAllOnesValue()) {
// If we have a mul by -1 of an add, try distributing the -1 among the
@@ -3583,20 +3577,12 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
}
// ((-C + (C smax %x)) /u %x) evaluates to zero, for any positive constant C.
- if (const auto *AE = dyn_cast<SCEVAddExpr>(LHS);
- AE && AE->getNumOperands() == 2) {
- if (const auto *VC = dyn_cast<SCEVConstant>(AE->getOperand(0))) {
- const APInt &NegC = VC->getAPInt();
- if (NegC.isNegative() && !NegC.isMinSignedValue()) {
- const auto *MME = dyn_cast<SCEVSMaxExpr>(AE->getOperand(1));
- if (MME && MME->getNumOperands() == 2 &&
- isa<SCEVConstant>(MME->getOperand(0)) &&
- cast<SCEVConstant>(MME->getOperand(0))->getAPInt() == -NegC &&
- MME->getOperand(1) == RHS)
- return getZero(LHS->getType());
- }
- }
- }
+ const APInt *NegC, *C;
+ if (match(LHS,
+ m_scev_Add(m_scev_APInt(NegC),
+ m_scev_SMax(m_scev_APInt(C), m_scev_Specific(RHS)))) &&
+ NegC->isNegative() && !NegC->isMinSignedValue() && *C == -*NegC)
+ return getZero(LHS->getType());
// TODO: Generalize to handle any common factors.
// udiv (mul nuw a, vscale), (mul nuw b, vscale) --> udiv a, b
@@ -4628,17 +4614,11 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
/// If Expr computes ~A, return A else return nullptr
static const SCEV *MatchNotExpr(const SCEV *Expr) {
- const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
- if (!Add || Add->getNumOperands() != 2 ||
- !Add->getOperand(0)->isAllOnesValue())
- return nullptr;
-
- const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
- if (!AddRHS || AddRHS->getNumOperands() != 2 ||
- !AddRHS->getOperand(0)->isAllOnesValue())
- return nullptr;
-
- return AddRHS->getOperand(1);
+ const SCEV *MulOp;
+ if (match(Expr, m_scev_Add(m_scev_AllOnes(),
+ m_scev_Mul(m_scev_AllOnes(), m_SCEV(MulOp)))))
+ return MulOp;
+ return nullptr;
}
/// Return a SCEV corresponding to ~V = -1-V
@@ -5419,20 +5399,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
if (SourceBits != NewBits)
return nullptr;
- const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
- const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
- if (!SExt && !ZExt)
- return nullptr;
- const SCEVTruncateExpr *Trunc =
- SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
- : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
- if (!Trunc)
- return nullptr;
- const SCEV *X = Trunc->getOperand();
- if (X != SymbolicPHI)
- return nullptr;
- Signed = SExt != nullptr;
- return Trunc->getType();
+ if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = true;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = false;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ return nullptr;
}
static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -6427,8 +6402,18 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S,
case scSequentialUMinExpr:
return GetGCDMultiple(cast<SCEVNAryExpr>(S));
case scUnknown: {
- // ask ValueTracking for known bits
+ // Ask ValueTracking for known bits. SCEVUnknowns only become available at
+ // the point their underlying IR instruction has been defined. If CtxI was
+ // not provided, use:
+ // * the first instruction in the entry block if it is an argument
+ // * the instruction itself otherwise.
const SCEVUnknown *U = cast<SCEVUnknown>(S);
+ if (!CtxI) {
+ if (isa<Argument>(U->getValue()))
+ CtxI = &*F.getEntryBlock().begin();
+ else if (auto *I = dyn_cast<Instruction>(U->getValue()))
+ CtxI = I;
+ }
unsigned Known =
computeKnownBits(U->getValue(), getDataLayout(), &AC, CtxI, &DT)
.countMinTrailingZeros();
@@ -10797,19 +10782,15 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) {
}
static bool MatchBinarySub(const SCEV *S, const SCEV *&LHS, const SCEV *&RHS) {
- const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S);
- if (!Add || Add->getNumOperands() != 2)
+ const SCEV *Op0, *Op1;
+ if (!match(S, m_scev_Add(m_SCEV(Op0), m_SCEV(Op1))))
return false;
- if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
- ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) {
- LHS = Add->getOperand(1);
- RHS = ME->getOperand(1);
+ if (match(Op0, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) {
+ LHS = Op1;
return true;
}
- if (auto *ME = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
- ME && ME->getNumOperands() == 2 && ME->getOperand(0)->isAllOnesValue()) {
- LHS = Add->getOperand(0);
- RHS = ME->getOperand(1);
+ if (match(Op1, m_scev_Mul(m_scev_AllOnes(), m_SCEV(RHS)))) {
+ LHS = Op0;
return true;
}
return false;
@@ -12172,13 +12153,10 @@ bool ScalarEvolution::isImpliedCondBalancedTypes(
bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr,
const SCEV *&L, const SCEV *&R,
SCEV::NoWrapFlags &Flags) {
- const auto *AE = dyn_cast<SCEVAddExpr>(Expr);
- if (!AE || AE->getNumOperands() != 2)
+ if (!match(Expr, m_scev_Add(m_SCEV(L), m_SCEV(R))))
return false;
- L = AE->getOperand(0);
- R = AE->getOperand(1);
- Flags = AE->getNoWrapFlags();
+ Flags = cast<SCEVAddExpr>(Expr)->getNoWrapFlags();
return true;
}
@@ -12220,12 +12198,11 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) {
// Try to match a common constant multiply.
auto MatchConstMul =
[](const SCEV *S) -> std::optional<std::pair<const SCEV *, APInt>> {
- auto *M = dyn_cast<SCEVMulExpr>(S);
- if (!M || M->getNumOperands() != 2 ||
- !isa<SCEVConstant>(M->getOperand(0)))
- return std::nullopt;
- return {
- {M->getOperand(1), cast<SCEVConstant>(M->getOperand(0))->getAPInt()}};
+ const APInt *C;
+ const SCEV *Op;
+ if (match(S, m_scev_Mul(m_scev_APInt(C), m_SCEV(Op))))
+ return {{Op, *C}};
+ return std::nullopt;
};
if (auto MatchedMore = MatchConstMul(More)) {
if (auto MatchedLess = MatchConstMul(Less)) {
@@ -15415,67 +15392,6 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
}
}
-// Match the mathematical pattern A - (A / B) * B, where A and B can be
-// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
-// for URem with constant power-of-2 second operands.
-// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is
-// 4, A / B becomes X / 8).
-bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
- const SCEV *&RHS) {
- if (Expr->getType()->isPointerTy())
- return false;
-
- // Try to match 'zext (trunc A to iB) to iY', which is used
- // for URem with constant power-of-2 second operands. Make sure the size of
- // the operand A matches the size of the whole expressions.
- if (const auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Expr))
- if (const auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0))) {
- LHS = Trunc->getOperand();
- // Bail out if the type of the LHS is larger than the type of the
- // expression for now.
- if (getTypeSizeInBits(LHS->getType()) >
- getTypeSizeInBits(Expr->getType()))
- return false;
- if (LHS->getType() != Expr->getType())
- LHS = getZeroExtendExpr(LHS, Expr->getType());
- RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
- << getTypeSizeInBits(Trunc->getType()));
- return true;
- }
- const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
- if (Add == nullptr || Add->getNumOperands() != 2)
- return false;
-
- const SCEV *A = Add->getOperand(1);
- const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
-
- if (Mul == nullptr)
- return false;
-
- const auto MatchURemWithDivisor = [&](const SCEV *B) {
- // (SomeExpr + (-(SomeExpr / B) * B)).
- if (Expr == getURemExpr(A, B)) {
- LHS = A;
- RHS = B;
- return true;
- }
- return false;
- };
-
- // (SomeExpr + (-1 * (SomeExpr / B) * B)).
- if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
- return MatchURemWithDivisor(Mul->getOperand(1)) ||
- MatchURemWithDivisor(Mul->getOperand(2));
-
- // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
- if (Mul->getNumOperands() == 2)
- return MatchURemWithDivisor(Mul->getOperand(1)) ||
- MatchURemWithDivisor(Mul->getOperand(0)) ||
- MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) ||
- MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0)));
- return false;
-}
-
ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
BasicBlock *Header = L->getHeader();
@@ -15618,19 +15534,10 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
auto IsMinMaxSCEVWithNonNegativeConstant =
[&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS,
const SCEV *&RHS) {
- if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr)) {
- if (MinMax->getNumOperands() != 2)
- return false;
- if (auto *C = dyn_cast<SCEVConstant>(MinMax->getOperand(0))) {
- if (C->getAPInt().isNegative())
- return false;
- SCTy = MinMax->getSCEVType();
- LHS = MinMax->getOperand(0);
- RHS = MinMax->getOperand(1);
- return true;
- }
- }
- return false;
+ const APInt *C;
+ SCTy = Expr->getSCEVType();
+ return match(Expr, m_scev_MinMax(m_SCEV(LHS), m_SCEV(RHS))) &&
+ match(LHS, m_scev_APInt(C)) && C->isNonNegative();
};
// Return a new SCEV that modifies \p Expr to the closest number divides by
@@ -15696,20 +15603,18 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
// If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
// explicitly express that.
- const SCEV *URemLHS = nullptr;
+ const SCEVUnknown *URemLHS = nullptr;
const SCEV *URemRHS = nullptr;
- if (SE.matchURem(LHS, URemLHS, URemRHS)) {
- if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
- auto I = RewriteMap.find(LHSUnknown);
- const SCEV *RewrittenLHS =
- I != RewriteMap.end() ? I->second : LHSUnknown;
- RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
- const auto *Multiple =
- SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
- RewriteMap[LHSUnknown] = Multiple;
- ExprsToRewrite.push_back(LHSUnknown);
- return;
- }
+ if (match(LHS,
+ m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) {
+ auto I = RewriteMap.find(URemLHS);
+ const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS;
+ RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
+ const auto *Multiple =
+ SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
+ RewriteMap[URemLHS] = Multiple;
+ ExprsToRewrite.push_back(URemLHS);
+ return;
}
}
@@ -15834,6 +15739,21 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
const SCEV *OneAlignedUp =
GetNextSCEVDividesByDivisor(One, DividesBy);
To = SE.getUMaxExpr(FromRewritten, OneAlignedUp);
+ } else {
+ if (LHS->getType()->isPointerTy()) {
+ LHS = SE.getLosslessPtrToIntExpr(LHS);
+ RHS = SE.getLosslessPtrToIntExpr(RHS);
+ if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS))
+ break;
+ }
+ auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) {
+ const SCEV *Sub = SE.getMinusSCEV(A, B);
+ AddRewrite(Sub, Sub,
+ SE.getUMaxExpr(Sub, SE.getOne(From->getType())));
+ };
+ AddSubRewrite(LHS, RHS);
+ AddSubRewrite(RHS, LHS);
+ continue;
}
break;
default:
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
index b036b2d..61d4935 100644
--- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp
+++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
@@ -1,11 +1,55 @@
#include "llvm/Analysis/StaticDataProfileInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/ProfileData/InstrProf.h"
+#define DEBUG_TYPE "static-data-profile-info"
+
using namespace llvm;
+
+namespace llvm {
+namespace memprof {
+// Returns true iff the global variable has custom section either by
+// __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+ if (GVar.hasSection())
+ return true;
+
+ auto Attrs = GVar.getAttributes();
+ if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+ Attrs.hasAttribute("relro-section") ||
+ Attrs.hasAttribute("rodata-section"))
+ return true;
+ return false;
+}
+
+AnnotationKind getAnnotationKind(const GlobalVariable &GV) {
+ if (GV.isDeclarationForLinker())
+ return AnnotationKind::DeclForLinker;
+ // Skip 'llvm.'-prefixed global variables conservatively because they are
+ // often handled specially.
+ StringRef Name = GV.getName();
+ if (Name.starts_with("llvm."))
+ return AnnotationKind::ReservedName;
+ // Respect user-specified custom data sections.
+ if (hasExplicitSectionName(GV))
+ return AnnotationKind::ExplicitSection;
+ return AnnotationKind::AnnotationOK;
+}
+
+bool IsAnnotationOK(const GlobalVariable &GV) {
+ return getAnnotationKind(GV) == AnnotationKind::AnnotationOK;
+}
+} // namespace memprof
+} // namespace llvm
+
void StaticDataProfileInfo::addConstantProfileCount(
const Constant *C, std::optional<uint64_t> Count) {
if (!Count) {
@@ -20,6 +64,47 @@ void StaticDataProfileInfo::addConstantProfileCount(
OriginalCount = getInstrMaxCountValue();
}
+StaticDataProfileInfo::StaticDataHotness
+StaticDataProfileInfo::getConstantHotnessUsingProfileCount(
+ const Constant *C, const ProfileSummaryInfo *PSI, uint64_t Count) const {
+ // The accumulated counter shows the constant is hot. Return enum 'hot'
+ // whether this variable is seen by unprofiled functions or not.
+ if (PSI->isHotCount(Count))
+ return StaticDataHotness::Hot;
+ // The constant is not hot, and seen by unprofiled functions. We don't want to
+ // assign it to unlikely sections, even if the counter says 'cold'. So return
+ // enum 'LukewarmOrUnknown'.
+ if (ConstantWithoutCounts.count(C))
+ return StaticDataHotness::LukewarmOrUnknown;
+ // The accumulated counter shows the constant is cold so return enum 'cold'.
+ if (PSI->isColdCount(Count))
+ return StaticDataHotness::Cold;
+
+ return StaticDataHotness::LukewarmOrUnknown;
+}
+
+StaticDataProfileInfo::StaticDataHotness
+StaticDataProfileInfo::getSectionHotnessUsingDataAccessProfile(
+ std::optional<StringRef> MaybeSectionPrefix) const {
+ if (!MaybeSectionPrefix)
+ return StaticDataHotness::LukewarmOrUnknown;
+ StringRef Prefix = *MaybeSectionPrefix;
+ assert((Prefix == "hot" || Prefix == "unlikely") &&
+ "Expect section_prefix to be one of hot or unlikely");
+ return Prefix == "hot" ? StaticDataHotness::Hot : StaticDataHotness::Cold;
+}
+
+StringRef StaticDataProfileInfo::hotnessToStr(StaticDataHotness Hotness) const {
+ switch (Hotness) {
+ case StaticDataHotness::Cold:
+ return "unlikely";
+ case StaticDataHotness::Hot:
+ return "hot";
+ default:
+ return "";
+ }
+}
+
std::optional<uint64_t>
StaticDataProfileInfo::getConstantProfileCount(const Constant *C) const {
auto I = ConstantProfileCounts.find(C);
@@ -30,27 +115,67 @@ StaticDataProfileInfo::getConstantProfileCount(const Constant *C) const {
StringRef StaticDataProfileInfo::getConstantSectionPrefix(
const Constant *C, const ProfileSummaryInfo *PSI) const {
- auto Count = getConstantProfileCount(C);
+ std::optional<uint64_t> Count = getConstantProfileCount(C);
+
+#ifndef NDEBUG
+ auto DbgPrintPrefix = [](StringRef Prefix) {
+ return Prefix.empty() ? "<empty>" : Prefix;
+ };
+#endif
+
+ if (EnableDataAccessProf) {
+ // Module flag `HasDataAccessProf` is 1 -> empty section prefix means
+ // unknown hotness except for string literals.
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C);
+ GV && llvm::memprof::IsAnnotationOK(*GV) &&
+ !GV->getName().starts_with(".str")) {
+ auto HotnessFromDataAccessProf =
+ getSectionHotnessUsingDataAccessProfile(GV->getSectionPrefix());
+
+ if (!Count) {
+ StringRef Prefix = hotnessToStr(HotnessFromDataAccessProf);
+ LLVM_DEBUG(dbgs() << GV->getName() << " has section prefix "
+ << DbgPrintPrefix(Prefix)
+ << ", solely from data access profiles\n");
+ return Prefix;
+ }
+
+ // Both data access profiles and PGO counters are available. Use the
+ // hotter one.
+ auto HotnessFromPGO = getConstantHotnessUsingProfileCount(C, PSI, *Count);
+ StaticDataHotness GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown;
+ if (HotnessFromDataAccessProf == StaticDataHotness::Hot ||
+ HotnessFromPGO == StaticDataHotness::Hot) {
+ GlobalVarHotness = StaticDataHotness::Hot;
+ } else if (HotnessFromDataAccessProf ==
+ StaticDataHotness::LukewarmOrUnknown ||
+ HotnessFromPGO == StaticDataHotness::LukewarmOrUnknown) {
+ GlobalVarHotness = StaticDataHotness::LukewarmOrUnknown;
+ } else {
+ GlobalVarHotness = StaticDataHotness::Cold;
+ }
+ StringRef Prefix = hotnessToStr(GlobalVarHotness);
+ LLVM_DEBUG(
+ dbgs() << GV->getName() << " has section prefix "
+ << DbgPrintPrefix(Prefix)
+ << ", the max from data access profiles as "
+ << DbgPrintPrefix(hotnessToStr(HotnessFromDataAccessProf))
+ << " and PGO counters as "
+ << DbgPrintPrefix(hotnessToStr(HotnessFromPGO)) << "\n");
+ return Prefix;
+ }
+ }
if (!Count)
return "";
- // The accummulated counter shows the constant is hot. Return 'hot' whether
- // this variable is seen by unprofiled functions or not.
- if (PSI->isHotCount(*Count))
- return "hot";
- // The constant is not hot, and seen by unprofiled functions. We don't want to
- // assign it to unlikely sections, even if the counter says 'cold'. So return
- // an empty prefix before checking whether the counter is cold.
- if (ConstantWithoutCounts.count(C))
- return "";
- // The accummulated counter shows the constant is cold. Return 'unlikely'.
- if (PSI->isColdCount(*Count))
- return "unlikely";
- // The counter says lukewarm. Return an empty prefix.
- return "";
+ return hotnessToStr(getConstantHotnessUsingProfileCount(C, PSI, *Count));
}
bool StaticDataProfileInfoWrapperPass::doInitialization(Module &M) {
- Info.reset(new StaticDataProfileInfo());
+ bool EnableDataAccessProf = false;
+ if (auto *MD = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("EnableDataAccessProf")))
+ EnableDataAccessProf = MD->getZExtValue();
+ Info.reset(new StaticDataProfileInfo(EnableDataAccessProf));
return false;
}
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 380b192..cf63285 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -329,10 +329,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
for (const auto &[Name, Info] : make_early_inc_range(ForwardRefVals)) {
if (StringRef(Name).starts_with("llvm.")) {
Intrinsic::ID IID = Intrinsic::lookupIntrinsicID(Name);
- if (IID == Intrinsic::not_intrinsic)
- // Don't do anything for unknown intrinsics.
- continue;
-
// Automatically create declarations for intrinsics. Intrinsics can only
// be called directly, so the call function type directly determines the
// declaration function type.
@@ -346,11 +342,26 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
return error(Info.second, "intrinsic can only be used as callee");
SmallVector<Type *> OverloadTys;
- if (!Intrinsic::getIntrinsicSignature(IID, CB->getFunctionType(),
- OverloadTys))
- return error(Info.second, "invalid intrinsic signature");
-
- U.set(Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys));
+ if (IID != Intrinsic::not_intrinsic &&
+ Intrinsic::getIntrinsicSignature(IID, CB->getFunctionType(),
+ OverloadTys)) {
+ U.set(Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys));
+ } else {
+ // Try to upgrade the intrinsic.
+ Function *TmpF = Function::Create(CB->getFunctionType(),
+ Function::ExternalLinkage, Name, M);
+ Function *NewF = nullptr;
+ if (!UpgradeIntrinsicFunction(TmpF, NewF)) {
+ if (IID == Intrinsic::not_intrinsic)
+ return error(Info.second, "unknown intrinsic '" + Name + "'");
+ return error(Info.second, "invalid intrinsic signature");
+ }
+
+ U.set(TmpF);
+ UpgradeIntrinsicCall(CB, NewF);
+ if (TmpF->use_empty())
+ TmpF->eraseFromParent();
+ }
}
Info.first->eraseFromParent();
@@ -1259,7 +1270,7 @@ bool LLParser::parseAliasOrIFunc(const std::string &Name, unsigned NameID,
if (parseToken(lltok::StringConstant, "expected partition string"))
return true;
} else if (!IsAlias && Lex.getKind() == lltok::MetadataVar) {
- if (parseGlobalObjectMetadataAttachment(*GI.get()))
+ if (parseGlobalObjectMetadataAttachment(*GI))
return true;
} else {
return tokError("unknown alias or ifunc property!");
@@ -5865,6 +5876,7 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
REQUIRED(file, MDField, (/* AllowNull */ false)); \
OPTIONAL(language, DwarfLangField, ); \
OPTIONAL(sourceLanguageName, DwarfSourceLangNameField, ); \
+ OPTIONAL(sourceLanguageVersion, MDUnsignedField, (0, UINT32_MAX)); \
OPTIONAL(producer, MDStringField, ); \
OPTIONAL(isOptimized, MDBoolField, ); \
OPTIONAL(flags, MDStringField, ); \
@@ -5894,10 +5906,15 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
return error(Loc, "can only specify one of 'language' and "
"'sourceLanguageName' on !DICompileUnit");
+ if (sourceLanguageVersion.Seen && !sourceLanguageName.Seen)
+ return error(Loc, "'sourceLanguageVersion' requires an associated "
+ "'sourceLanguageName' on !DICompileUnit");
+
Result = DICompileUnit::getDistinct(
Context,
language.Seen ? DISourceLanguageName(language.Val)
- : DISourceLanguageName(sourceLanguageName.Val, 0),
+ : DISourceLanguageName(sourceLanguageName.Val,
+ sourceLanguageVersion.Val),
file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val,
splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val,
globals.Val, imports.Val, macros.Val, dwoId.Val, splitDebugInlining.Val,
diff --git a/llvm/lib/BinaryFormat/XCOFF.cpp b/llvm/lib/BinaryFormat/XCOFF.cpp
index e0a4471..19d5b98 100644
--- a/llvm/lib/BinaryFormat/XCOFF.cpp
+++ b/llvm/lib/BinaryFormat/XCOFF.cpp
@@ -112,26 +112,26 @@ StringRef XCOFF::getNameForTracebackTableLanguageId(
XCOFF::CFileCpuId XCOFF::getCpuID(StringRef CPUName) {
StringRef CPU = PPC::normalizeCPUName(CPUName);
return StringSwitch<XCOFF::CFileCpuId>(CPU)
- .Cases("generic", "COM", XCOFF::TCPU_COM)
+ .Cases({"generic", "COM"}, XCOFF::TCPU_COM)
.Case("601", XCOFF::TCPU_601)
- .Cases("602", "603", "603e", "603ev", XCOFF::TCPU_603)
- .Cases("604", "604e", XCOFF::TCPU_604)
+ .Cases({"602", "603", "603e", "603ev"}, XCOFF::TCPU_603)
+ .Cases({"604", "604e"}, XCOFF::TCPU_604)
.Case("620", XCOFF::TCPU_620)
.Case("970", XCOFF::TCPU_970)
- .Cases("a2", "g3", "g4", "g5", "e500", XCOFF::TCPU_COM)
- .Cases("pwr3", "pwr4", XCOFF::TCPU_COM)
- .Cases("pwr5", "PWR5", XCOFF::TCPU_PWR5)
- .Cases("pwr5x", "PWR5X", XCOFF::TCPU_PWR5X)
- .Cases("pwr6", "PWR6", XCOFF::TCPU_PWR6)
- .Cases("pwr6x", "PWR6E", XCOFF::TCPU_PWR6E)
- .Cases("pwr7", "PWR7", XCOFF::TCPU_PWR7)
- .Cases("pwr8", "PWR8", XCOFF::TCPU_PWR8)
- .Cases("pwr9", "PWR9", XCOFF::TCPU_PWR9)
- .Cases("pwr10", "PWR10", XCOFF::TCPU_PWR10)
- .Cases("ppc", "PPC", "ppc32", "ppc64", XCOFF::TCPU_COM)
+ .Cases({"a2", "g3", "g4", "g5", "e500"}, XCOFF::TCPU_COM)
+ .Cases({"pwr3", "pwr4"}, XCOFF::TCPU_COM)
+ .Cases({"pwr5", "PWR5"}, XCOFF::TCPU_PWR5)
+ .Cases({"pwr5x", "PWR5X"}, XCOFF::TCPU_PWR5X)
+ .Cases({"pwr6", "PWR6"}, XCOFF::TCPU_PWR6)
+ .Cases({"pwr6x", "PWR6E"}, XCOFF::TCPU_PWR6E)
+ .Cases({"pwr7", "PWR7"}, XCOFF::TCPU_PWR7)
+ .Cases({"pwr8", "PWR8"}, XCOFF::TCPU_PWR8)
+ .Cases({"pwr9", "PWR9"}, XCOFF::TCPU_PWR9)
+ .Cases({"pwr10", "PWR10"}, XCOFF::TCPU_PWR10)
+ .Cases({"ppc", "PPC", "ppc32", "ppc64"}, XCOFF::TCPU_COM)
.Case("ppc64le", XCOFF::TCPU_PWR8)
.Case("future", XCOFF::TCPU_PWR10)
- .Cases("any", "ANY", XCOFF::TCPU_ANY)
+ .Cases({"any", "ANY"}, XCOFF::TCPU_ANY)
.Default(XCOFF::TCPU_INVALID);
}
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index cdcf7a8..ed0443f 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1860,7 +1860,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPILE_UNIT: {
- if (Record.size() < 14 || Record.size() > 22)
+ if (Record.size() < 14 || Record.size() > 23)
return error("Invalid record");
// Ignore Record[0], which indicates whether this compile unit is
@@ -1869,11 +1869,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
const auto LangVersionMask = (uint64_t(1) << 63);
const bool HasVersionedLanguage = Record[1] & LangVersionMask;
+ const uint32_t LanguageVersion = Record.size() > 22 ? Record[22] : 0;
auto *CU = DICompileUnit::getDistinct(
Context,
HasVersionedLanguage
- ? DISourceLanguageName(Record[1] & ~LangVersionMask, 0)
+ ? DISourceLanguageName(Record[1] & ~LangVersionMask,
+ LanguageVersion)
: DISourceLanguageName(Record[1]),
getMDOrNull(Record[2]), getMDString(Record[3]), Record[4],
getMDString(Record[5]), Record[6], getMDString(Record[7]), Record[8],
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 54e916e..8ff3aa9 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2142,6 +2142,7 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
Record.push_back(N->getRangesBaseAddress());
Record.push_back(VE.getMetadataOrNullID(N->getRawSysRoot()));
Record.push_back(VE.getMetadataOrNullID(N->getRawSDK()));
+ Record.push_back(Lang.hasVersionedName() ? Lang.getVersion() : 0);
Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
Record.clear();
diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
index 323b21e..4e6f93e 100644
--- a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
+++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
@@ -1102,8 +1102,6 @@ void TrieRawHashMapHandle::print(
if (auto Err = Printer.printRecords())
OS << "error: " << toString(std::move(Err)) << "\n";
-
- return;
}
Error TrieRawHashMapHandle::validate(
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 6356d71..873ac8f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -20,7 +20,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
-namespace llvm {
+using namespace llvm;
AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
@@ -90,5 +90,3 @@ void AIXException::endFunction(const MachineFunction *MF) {
emitExceptionInfoTable(LSDALabel, PerSym);
}
-
-} // End of namespace llvm
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 219bbc9..e2af0c5 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -119,6 +119,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/VCSRevision.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -476,6 +477,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool AsmPrinter::doInitialization(Module &M) {
+ VFS = vfs::getRealFileSystem();
auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
HasSplitStack = false;
@@ -1437,7 +1439,8 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges,
BrProbEnabled,
MF.hasBBSections() && NumMBBSectionRanges > 1,
static_cast<bool>(BBAddrMapSkipEmitBBEntries),
- HasCalls};
+ HasCalls,
+ false};
}
void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
@@ -1682,7 +1685,7 @@ static ConstantInt *extractNumericCGTypeId(const Function &F) {
return nullptr;
}
-/// Emits .callgraph section.
+/// Emits .llvm.callgraph section.
void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
FunctionCallGraphInfo &FuncCGInfo) {
if (!MF.getTarget().Options.EmitCallGraphSection)
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index c364ffc..8dd8b9da 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -36,6 +36,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -98,6 +99,7 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
unsigned BufNum = addInlineAsmDiagBuffer(Str, LocMDNode);
SourceMgr &SrcMgr = *MMI->getContext().getInlineSourceManager();
SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+ SrcMgr.setVirtualFileSystem(VFS);
std::unique_ptr<MCAsmParser> Parser(
createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 433877f..72582d7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2066,11 +2066,36 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
if (NoDebug)
return;
+ auto RecordLineZero = [&]() {
+ // Preserve the file and column numbers, if we can, to save space in
+ // the encoded line table.
+ // Do not update PrevInstLoc, it remembers the last non-0 line.
+ const MDNode *Scope = nullptr;
+ unsigned Column = 0;
+ if (PrevInstLoc) {
+ Scope = PrevInstLoc.getScope();
+ Column = PrevInstLoc.getCol();
+ }
+ recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
+ };
+
+ // When we emit a line-0 record, we don't update PrevInstLoc; so look at
+ // the last line number actually emitted, to see if it was line 0.
+ unsigned LastAsmLine =
+ Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
+
// Check if source location changes, but ignore DBG_VALUE and CFI locations.
// If the instruction is part of the function frame setup code, do not emit
// any line record, as there is no correspondence with any user code.
- if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup))
+ if (MI->isMetaInstruction())
+ return;
+ if (MI->getFlag(MachineInstr::FrameSetup)) {
+ // Prevent a loc from the previous block leaking into frame setup instrs.
+ if (LastAsmLine && PrevInstBB && PrevInstBB != MI->getParent())
+ RecordLineZero();
return;
+ }
+
const DebugLoc &DL = MI->getDebugLoc();
unsigned Flags = 0;
@@ -2093,11 +2118,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
LocationString);
};
- // When we emit a line-0 record, we don't update PrevInstLoc; so look at
- // the last line number actually emitted, to see if it was line 0.
- unsigned LastAsmLine =
- Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
-
// There may be a mixture of scopes using and not using Key Instructions.
// Not-Key-Instructions functions inlined into Key Instructions functions
// should use not-key is_stmt handling. Key Instructions functions inlined
@@ -2163,18 +2183,8 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
// - Instruction is at the top of a block; we don't want to inherit the
// location from the physically previous (maybe unrelated) block.
if (UnknownLocations == Enable || PrevLabel ||
- (PrevInstBB && PrevInstBB != MI->getParent())) {
- // Preserve the file and column numbers, if we can, to save space in
- // the encoded line table.
- // Do not update PrevInstLoc, it remembers the last non-0 line.
- const MDNode *Scope = nullptr;
- unsigned Column = 0;
- if (PrevInstLoc) {
- Scope = PrevInstLoc.getScope();
- Column = PrevInstLoc.getCol();
- }
- recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
- }
+ (PrevInstBB && PrevInstBB != MI->getParent()))
+ RecordLineZero();
return;
}
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 260ce8f..93ae548 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -85,8 +85,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
-namespace std {
-template <> struct hash<VarLocInsertPt> {
+template <> struct std::hash<VarLocInsertPt> {
using argument_type = VarLocInsertPt;
using result_type = std::size_t;
@@ -94,7 +93,6 @@ template <> struct hash<VarLocInsertPt> {
return std::hash<void *>()(Arg.getOpaqueValue());
}
};
-} // namespace std
/// Helper class to build FunctionVarLocs, since that class isn't easy to
/// modify. TODO: There's not a great deal of value in the split, it could be
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 4931403..53f1cfe2 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -770,7 +770,7 @@ struct PartwordMaskValues {
Value *Inv_Mask = nullptr;
};
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
auto PrintObj = [&O](auto *V) {
if (V)
diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
index fd7df6b..47b7a88 100644
--- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
+++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
@@ -207,9 +207,7 @@ bool ApplyCloning(MachineFunction &MF,
}
return AnyPathsCloned;
}
-} // end anonymous namespace
-namespace llvm {
class BasicBlockPathCloning : public MachineFunctionPass {
public:
static char ID;
@@ -229,7 +227,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
};
-} // namespace llvm
+} // namespace
char BasicBlockPathCloning::ID = 0;
INITIALIZE_PASS_BEGIN(
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index 2d50167..fae952e 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -491,6 +491,20 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
return true;
}
if (FBB) {
+ // If we get here with a MBB which ends like this:
+ //
+ // bb.1:
+ // successors: %bb.2;
+ // ...
+ // BNE $x1, $x0, %bb.2
+ // PseudoBR %bb.2
+ //
+ // Just remove conditional branch.
+ if (TBB == FBB) {
+ removeBranch(MBB);
+ insertUncondBranch(MBB, TBB);
+ return true;
+ }
// We need to split the basic block here to obtain two long-range
// unconditional branches.
NewBB = createNewBlockAfter(*MBB);
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 28e6728..1846880 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -31,7 +31,7 @@
using namespace llvm;
-namespace llvm {
+namespace {
class BreakFalseDeps : public MachineFunctionPass {
private:
@@ -95,7 +95,7 @@ private:
void processUndefReads(MachineBasicBlock *);
};
-} // namespace llvm
+} // namespace
#define DEBUG_TYPE "break-false-deps"
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 4320b1d..9e78ec9 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -819,7 +819,7 @@ void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
}
// Verify BFI has been updated correctly by recomputing BFI and comparing them.
-void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
+[[maybe_unused]] void CodeGenPrepare::verifyBFIUpdates(Function &F) {
DominatorTree NewDT(F);
LoopInfo NewLI(NewDT);
BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 6c2a5a7..87ada87 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -126,8 +126,7 @@ hash_code hash_value(const ComplexValue &Arg) {
} // end namespace
typedef SmallVector<struct ComplexValue, 2> ComplexValues;
-namespace llvm {
-template <> struct DenseMapInfo<ComplexValue> {
+template <> struct llvm::DenseMapInfo<ComplexValue> {
static inline ComplexValue getEmptyKey() {
return {DenseMapInfo<Value *>::getEmptyKey(),
DenseMapInfo<Value *>::getEmptyKey()};
@@ -144,7 +143,6 @@ template <> struct DenseMapInfo<ComplexValue> {
return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
}
};
-} // end namespace llvm
namespace {
template <typename T, typename IterT>
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index f4335396..50dd66f 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -81,13 +81,10 @@ void EdgeBundles::init() {
}
}
-namespace llvm {
-
/// Specialize WriteGraph, the standard implementation won't work.
-template<>
-raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
- bool ShortNames,
- const Twine &Title) {
+template <>
+raw_ostream &llvm::WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+ bool ShortNames, const Twine &Title) {
const MachineFunction *MF = G.getMachineFunction();
O << "digraph {\n";
@@ -107,8 +104,6 @@ raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
return O;
}
-} // end namespace llvm
-
/// view - Visualize the annotated bipartite CFG with Graphviz.
void EdgeBundles::view() const {
ViewGraph(*this, "EdgeBundles");
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 9cc6c6a..04c7008 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -82,7 +82,7 @@ public:
}
static FRemExpander create(IRBuilder<> &B, Type *Ty) {
- assert(canExpandType(Ty));
+ assert(canExpandType(Ty) && "Expected supported floating point type");
// The type to use for the computation of the remainder. This may be
// wider than the input/result type which affects the ...
@@ -356,8 +356,9 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y,
static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
- Type *ReturnTy = I.getType();
- assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+ Type *Ty = I.getType();
+ assert(FRemExpander::canExpandType(Ty) &&
+ "Expected supported floating point type");
FastMathFlags FMF = I.getFastMathFlags();
// TODO Make use of those flags for optimization?
@@ -368,32 +369,10 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
B.setFastMathFlags(FMF);
B.SetCurrentDebugLocation(I.getDebugLoc());
- Type *ElemTy = ReturnTy->getScalarType();
- const FRemExpander Expander = FRemExpander::create(B, ElemTy);
-
- Value *Ret;
- if (ReturnTy->isFloatingPointTy())
- Ret = FMF.approxFunc()
- ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
- : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
- else {
- auto *VecTy = cast<FixedVectorType>(ReturnTy);
-
- // This could use SplitBlockAndInsertForEachLane but the interface
- // is a bit awkward for a constant number of elements and it will
- // boil down to the same code.
- // TODO Expand the FRem instruction only once and reuse the code.
- Value *Nums = I.getOperand(0);
- Value *Denums = I.getOperand(1);
- Ret = PoisonValue::get(I.getType());
- for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
- Value *Num = B.CreateExtractElement(Nums, I);
- Value *Denum = B.CreateExtractElement(Denums, I);
- Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
- : Expander.buildFRem(Num, Denum, SQ);
- Ret = B.CreateInsertElement(Ret, Rem, I);
- }
- }
+ const FRemExpander Expander = FRemExpander::create(B, Ty);
+ Value *Ret = FMF.approxFunc()
+ ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+ : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
I.replaceAllUsesWith(Ret);
Ret->takeName(&I);
@@ -939,7 +918,8 @@ static void expandIToFP(Instruction *IToFP) {
IToFP->eraseFromParent();
}
-static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
+static void scalarize(Instruction *I,
+ SmallVectorImpl<Instruction *> &Worklist) {
VectorType *VTy = cast<FixedVectorType>(I->getType());
IRBuilder<> Builder(I);
@@ -948,12 +928,25 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
Value *Result = PoisonValue::get(VTy);
for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
- Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
- I->getType()->getScalarType());
- Result = Builder.CreateInsertElement(Result, Cast, Idx);
- if (isa<Instruction>(Cast))
- Replace.push_back(cast<Instruction>(Cast));
+
+ Value *NewOp = nullptr;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(I))
+ NewOp = Builder.CreateBinOp(
+ BinOp->getOpcode(), Ext,
+ Builder.CreateExtractElement(I->getOperand(1), Idx));
+ else if (auto *CastI = dyn_cast<CastInst>(I))
+ NewOp = Builder.CreateCast(CastI->getOpcode(), Ext,
+ I->getType()->getScalarType());
+ else
+ llvm_unreachable("Unsupported instruction type");
+
+ Result = Builder.CreateInsertElement(Result, NewOp, Idx);
+ if (auto *ScalarizedI = dyn_cast<Instruction>(NewOp)) {
+ ScalarizedI->copyIRFlags(I, true);
+ Worklist.push_back(ScalarizedI);
+ }
}
+
I->replaceAllUsesWith(Result);
I->dropAllReferences();
I->eraseFromParent();
@@ -989,10 +982,17 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
}
+static void addToWorklist(Instruction &I,
+ SmallVector<Instruction *, 4> &Worklist) {
+ if (I.getOperand(0)->getType()->isVectorTy())
+ scalarize(&I, Worklist);
+ else
+ Worklist.push_back(&I);
+}
+
static bool runImpl(Function &F, const TargetLowering &TLI,
AssumptionCache *AC) {
- SmallVector<Instruction *, 4> Replace;
- SmallVector<Instruction *, 4> ReplaceVector;
+ SmallVector<Instruction *, 4> Worklist;
bool Modified = false;
unsigned MaxLegalFpConvertBitWidth =
@@ -1003,55 +1003,39 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
return false;
- for (auto &I : instructions(F)) {
- switch (I.getOpcode()) {
- case Instruction::FRem: {
- Type *Ty = I.getType();
- // TODO: This pass doesn't handle scalable vectors.
- if (Ty->isScalableTy())
- continue;
-
- if (targetSupportsFrem(TLI, Ty) ||
- !FRemExpander::canExpandType(Ty->getScalarType()))
- continue;
-
- Replace.push_back(&I);
- Modified = true;
+ for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
+ Instruction &I = *It++;
+ Type *Ty = I.getType();
+ // TODO: This pass doesn't handle scalable vectors.
+ if (Ty->isScalableTy())
+ continue;
+ switch (I.getOpcode()) {
+ case Instruction::FRem:
+ if (!targetSupportsFrem(TLI, Ty) &&
+ FRemExpander::canExpandType(Ty->getScalarType())) {
+ addToWorklist(I, Worklist);
+ Modified = true;
+ }
break;
- }
case Instruction::FPToUI:
case Instruction::FPToSI: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
- auto *IntTy = cast<IntegerType>(I.getType()->getScalarType());
+ auto *IntTy = cast<IntegerType>(Ty->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
+ addToWorklist(I, Worklist);
Modified = true;
break;
}
case Instruction::UIToFP:
case Instruction::SIToFP: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
auto *IntTy =
cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
+ addToWorklist(I, Worklist);
Modified = true;
break;
}
@@ -1060,16 +1044,8 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}
}
- while (!ReplaceVector.empty()) {
- Instruction *I = ReplaceVector.pop_back_val();
- scalarize(I, Replace);
- }
-
- if (Replace.empty())
- return false;
-
- while (!Replace.empty()) {
- Instruction *I = Replace.pop_back_val();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
if (I->getOpcode() == Instruction::FRem) {
auto SQ = [&]() -> std::optional<SimplifyQuery> {
if (AC) {
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 90c60d4..04d9309 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -112,7 +112,7 @@ APInt GISelValueTracking::getKnownOnes(Register R) {
return getKnownBits(R).One;
}
-LLVM_ATTRIBUTE_UNUSED static void
+[[maybe_unused]] static void
dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) {
dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" << Depth
<< "] Computed for: " << MI << "[" << Depth << "] Known: 0x"
@@ -1975,6 +1975,81 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
break;
}
+ case TargetOpcode::G_SUB: {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ if (Src2NumSignBits == 1)
+ return 1; // Early out.
+
+ // Handle NEG.
+ Register Src1 = MI.getOperand(1).getReg();
+ KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+ if (Known1.isZero()) {
+ KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known2.Zero | 1).isAllOnes())
+ return TyBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has, at worst, the same number of sign bits as
+ // the input.
+ if (Known2.isNonNegative()) {
+ FirstAnswer = Src2NumSignBits;
+ break;
+ }
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits == 1)
+ return 1; // Early Out.
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+ break;
+ }
+ case TargetOpcode::G_ADD: {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ if (Src2NumSignBits <= 2)
+ return 1; // Early out.
+
+ Register Src1 = MI.getOperand(1).getReg();
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits == 1)
+ return 1; // Early Out.
+
+ // Special case decrementing a value (ADD X, -1):
+ KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+ if (Known2.isAllOnes()) {
+ KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known1.Zero | 1).isAllOnes())
+ return TyBits;
+
+ // If we are subtracting one from a positive number, there is no carry
+ // out of the result.
+ if (Known1.isNonNegative()) {
+ FirstAnswer = Src1NumSignBits;
+ break;
+ }
+
+ // Otherwise, we treat this like an ADD.
+ }
+
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+ break;
+ }
case TargetOpcode::G_FCMP:
case TargetOpcode::G_ICMP: {
bool IsFP = Opcode == TargetOpcode::G_FCMP;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
index 25c1db9..ded4df4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -55,12 +55,10 @@ LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
}
LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
- LLT NewEltTy) {
+ ElementCount EC) {
return [=](const LegalityQuery &Query) {
const LLT OldTy = Query.Types[TypeIdx];
- ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount()
- : ElementCount::getFixed(1);
- return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+ return std::make_pair(TypeIdx, OldTy.changeElementCount(EC));
};
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index cffaf7c..38ec83f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3292,8 +3292,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
if (TypeIdx != 2)
return UnableToLegalize;
Observer.changingInstr(MI);
- // TODO: Probably should be zext
- widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
Observer.changedInstr(MI);
return Legalized;
}
@@ -3325,8 +3324,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
if (TypeIdx == 2) {
Observer.changingInstr(MI);
- // TODO: Probably should be zext
- widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
Observer.changedInstr(MI);
return Legalized;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 055fdc6..ca82857 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -818,8 +818,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
if (!DefMI)
return false;
- const TargetMachine& TM = DefMI->getMF()->getTarget();
- if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
+ if (DefMI->getFlag(MachineInstr::FmNoNans))
return true;
// If the value is a constant, we can obviously see if it is a NaN or not.
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4a..81ab317 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -587,16 +587,12 @@ public:
} // namespace
char GlobalMergeFuncPassWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
+INITIALIZE_PASS(GlobalMergeFuncPassWrapper, "global-merge-func",
+ "Global merge function pass", false, false)
-namespace llvm {
-ModulePass *createGlobalMergeFuncPass() {
+ModulePass *llvm::createGlobalMergeFuncPass() {
return new GlobalMergeFuncPassWrapper();
}
-} // namespace llvm
GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
initializeGlobalMergeFuncPassWrapperPass(
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 3485a27..d2f2c3e 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -101,15 +101,11 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
static bool EnablePrecomputePhysRegs = false;
#endif // NDEBUG
-namespace llvm {
-
-cl::opt<bool> UseSegmentSetForPhysRegs(
+cl::opt<bool> llvm::UseSegmentSetForPhysRegs(
"use-segment-set-for-physregs", cl::Hidden, cl::init(true),
cl::desc(
"Use segment set for the computation of the live ranges of physregs."));
-} // end namespace llvm
-
void LiveIntervalsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<LiveVariablesWrapperPass>();
@@ -665,7 +661,10 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
SmallVectorImpl<SlotIndex> *EndPoints) {
LiveQueryResult LRQ = LR.Query(Kill);
- VNInfo *VNI = LRQ.valueOutOrDead();
+ // LR may have liveness reachable from early clobber slot, which may be
+ // only live-in instead of live-out of the instruction.
+ // For example, LR =[1r, 3r), Kill = 3e, we have to prune [3e, 3r) of LR.
+ VNInfo *VNI = LRQ.valueOutOrDead() ? LRQ.valueOutOrDead() : LRQ.valueIn();
if (!VNI)
return;
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index e859765..5c78d98 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -29,20 +29,17 @@ using namespace mir2vec;
STATISTIC(MIRVocabMissCounter,
"Number of lookups to MIR entities not present in the vocabulary");
-namespace llvm {
-namespace mir2vec {
-cl::OptionCategory MIR2VecCategory("MIR2Vec Options");
+cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options");
// FIXME: Use a default vocab when not specified
static cl::opt<std::string>
VocabFile("mir2vec-vocab-path", cl::Optional,
cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""),
cl::cat(MIR2VecCategory));
-cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
- cl::desc("Weight for machine opcode embeddings"),
- cl::cat(MIR2VecCategory));
-} // namespace mir2vec
-} // namespace llvm
+cl::opt<float>
+ llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
+ cl::desc("Weight for machine opcode embeddings"),
+ cl::cat(MIR2VecCategory));
//===----------------------------------------------------------------------===//
// Vocabulary Implementation
diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
index f5146f5..d988a2a 100644
--- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
+++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -40,7 +40,7 @@ cl::opt<bool> ImprovedFSDiscriminator(
"improved-fs-discriminator", cl::Hidden, cl::init(false),
cl::desc("New FS discriminators encoding (incompatible with the original "
"encoding)"));
-}
+} // namespace llvm
char MIRAddFSDiscriminators::ID = 0;
diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp
index bc65700..cbf8867 100644
--- a/llvm/lib/CodeGen/MIRNamerPass.cpp
+++ b/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -23,10 +23,6 @@
using namespace llvm;
-namespace llvm {
-extern char &MIRNamerID;
-} // namespace llvm
-
#define DEBUG_TYPE "mir-namer"
namespace {
@@ -53,10 +49,9 @@ public:
VRegRenamer Renamer(MF.getRegInfo());
- unsigned BBIndex = 0;
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
- for (auto &MBB : RPOT)
- Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+ for (const auto &[BBIndex, MBB] : enumerate(RPOT))
+ Changed |= Renamer.renameVRegs(MBB, BBIndex);
return Changed;
}
@@ -66,10 +61,4 @@ public:
char MIRNamer::ID;
-char &llvm::MIRNamerID = MIRNamer::ID;
-
-INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
-
-INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
+INITIALIZE_PASS(MIRNamer, "mir-namer", "Rename Register Operands", false, false)
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bf8a6cd..1d54d72 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -107,10 +107,8 @@ struct MFPrintState {
} // end anonymous namespace
-namespace llvm::yaml {
-
/// This struct serializes the LLVM IR module.
-template <> struct BlockScalarTraits<Module> {
+template <> struct yaml::BlockScalarTraits<Module> {
static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
Mod.print(OS, nullptr);
}
@@ -121,8 +119,6 @@ template <> struct BlockScalarTraits<Module> {
}
};
-} // end namespace llvm::yaml
-
static void printRegMIR(Register Reg, yaml::StringValue &Dest,
const TargetRegisterInfo *TRI) {
raw_string_ostream OS(Dest.Value);
@@ -866,48 +862,46 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
OS << TII->getName(MI.getOpcode());
- LS = ListSeparator();
+ // Print a space after the opcode if any additional tokens are printed.
+ LS = ListSeparator(", ", " ");
- if (I < E) {
- OS << ' ';
- for (; I < E; ++I) {
- OS << LS;
- printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
- PrintedTypes, MRI, /*PrintDef=*/true);
- }
+ for (; I < E; ++I) {
+ OS << LS;
+ printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
+ PrintedTypes, MRI, /*PrintDef=*/true);
}
// Print any optional symbols attached to this instruction as-if they were
// operands.
if (MCSymbol *PreInstrSymbol = MI.getPreInstrSymbol()) {
- OS << LS << " pre-instr-symbol ";
+ OS << LS << "pre-instr-symbol ";
MachineOperand::printSymbol(OS, *PreInstrSymbol);
}
if (MCSymbol *PostInstrSymbol = MI.getPostInstrSymbol()) {
- OS << LS << " post-instr-symbol ";
+ OS << LS << "post-instr-symbol ";
MachineOperand::printSymbol(OS, *PostInstrSymbol);
}
if (MDNode *HeapAllocMarker = MI.getHeapAllocMarker()) {
- OS << LS << " heap-alloc-marker ";
+ OS << LS << "heap-alloc-marker ";
HeapAllocMarker->printAsOperand(OS, State.MST);
}
if (MDNode *PCSections = MI.getPCSections()) {
- OS << LS << " pcsections ";
+ OS << LS << "pcsections ";
PCSections->printAsOperand(OS, State.MST);
}
if (MDNode *MMRA = MI.getMMRAMetadata()) {
- OS << LS << " mmra ";
+ OS << LS << "mmra ";
MMRA->printAsOperand(OS, State.MST);
}
if (uint32_t CFIType = MI.getCFIType())
- OS << LS << " cfi-type " << CFIType;
+ OS << LS << "cfi-type " << CFIType;
if (auto Num = MI.peekDebugInstrNum())
- OS << LS << " debug-instr-number " << Num;
+ OS << LS << "debug-instr-number " << Num;
if (PrintLocations) {
if (const DebugLoc &DL = MI.getDebugLoc()) {
- OS << LS << " debug-location ";
+ OS << LS << "debug-location ";
DL->printAsOperand(OS, State.MST);
}
}
diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
index b2731b69..a72c2c4 100644
--- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
@@ -97,7 +97,9 @@ static const bool EnableDevelopmentFeatures = false;
/// this happens only in development mode. It's a no-op otherwise.
namespace llvm {
extern cl::opt<unsigned> EvictInterferenceCutoff;
+} // namespace llvm
+namespace {
class RegAllocScoring : public MachineFunctionPass {
public:
static char ID;
@@ -124,11 +126,12 @@ public:
/// Performs this pass
bool runOnMachineFunction(MachineFunction &) override;
};
+} // namespace
char RegAllocScoring::ID = 0;
-FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
-
-} // namespace llvm
+FunctionPass *llvm::createRegAllocScoringPass() {
+ return new RegAllocScoring();
+}
INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
"Register Allocation Scoring Pass", false, false)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index e7fa082..26eb10f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
#define DEBUG_TYPE "machine-block-freq"
-namespace llvm {
static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"view-machine-block-freq-propagation-dags", cl::Hidden,
cl::desc("Pop up a window to show a dag displaying how machine block "
@@ -44,6 +43,7 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
clEnumValN(GVDT_Count, "count", "display a graph using the real "
"profile count if available.")));
+namespace llvm {
// Similar option above, but used to control BFI display only after MBP pass
cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
"view-block-layout-with-bfi", cl::Hidden,
@@ -69,15 +69,15 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
extern cl::opt<unsigned> ViewHotFreqPercent;
-static cl::opt<bool> PrintMachineBlockFreq(
- "print-machine-bfi", cl::init(false), cl::Hidden,
- cl::desc("Print the machine block frequency info."));
-
// Command line option to specify the name of the function for block frequency
// dump. Defined in Analysis/BlockFrequencyInfo.cpp.
extern cl::opt<std::string> PrintBFIFuncName;
} // namespace llvm
+static cl::opt<bool>
+ PrintMachineBlockFreq("print-machine-bfi", cl::init(false), cl::Hidden,
+ cl::desc("Print the machine block frequency info."));
+
static GVDAGType getGVDT() {
if (ViewBlockLayoutWithBFI != GVDT_None)
return ViewBlockLayoutWithBFI;
@@ -85,9 +85,7 @@ static GVDAGType getGVDT() {
return ViewMachineBlockFreqPropagationDAG;
}
-namespace llvm {
-
-template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct llvm::GraphTraits<MachineBlockFrequencyInfo *> {
using NodeRef = const MachineBasicBlock *;
using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
@@ -116,7 +114,7 @@ using MBFIDOTGraphTraitsBase =
MachineBranchProbabilityInfo>;
template <>
-struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+struct llvm::DOTGraphTraits<MachineBlockFrequencyInfo *>
: public MBFIDOTGraphTraitsBase {
const MachineFunction *CurFunc = nullptr;
DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
@@ -159,8 +157,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
}
};
-} // end namespace llvm
-
AnalysisKey MachineBlockFrequencyAnalysis::Key;
MachineBlockFrequencyAnalysis::Result
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 2e92dd8..7ca4582 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -18,13 +18,8 @@
using namespace llvm;
-INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-INITIALIZE_PASS_END(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-
+INITIALIZE_PASS(MachineBranchProbabilityInfoWrapperPass, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
namespace llvm {
cl::opt<unsigned>
StaticLikelyProb("static-likely-prob",
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index e359831..ea08365 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -1257,7 +1257,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
Tracker.clear();
}
-static void LLVM_ATTRIBUTE_UNUSED printSpillReloadChain(
+[[maybe_unused]] static void printSpillReloadChain(
DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &SpillChain,
DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &ReloadChain,
MachineInstr *Leader) {
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 224231c..bfa5ab2 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -719,43 +719,41 @@ MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
}
}
-namespace llvm {
+template <>
+struct llvm::DOTGraphTraits<const MachineFunction *>
+ : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- template<>
- struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ static std::string getGraphName(const MachineFunction *F) {
+ return ("CFG for '" + F->getName() + "' function").str();
+ }
- static std::string getGraphName(const MachineFunction *F) {
- return ("CFG for '" + F->getName() + "' function").str();
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ std::string OutStr;
+ {
+ raw_string_ostream OSS(OutStr);
+
+ if (isSimple()) {
+ OSS << printMBBReference(*Node);
+ if (const BasicBlock *BB = Node->getBasicBlock())
+ OSS << ": " << BB->getName();
+ } else
+ Node->print(OSS);
}
- std::string getNodeLabel(const MachineBasicBlock *Node,
- const MachineFunction *Graph) {
- std::string OutStr;
- {
- raw_string_ostream OSS(OutStr);
-
- if (isSimple()) {
- OSS << printMBBReference(*Node);
- if (const BasicBlock *BB = Node->getBasicBlock())
- OSS << ": " << BB->getName();
- } else
- Node->print(OSS);
- }
-
- if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
- // Process string output to make it nicer...
- for (unsigned i = 0; i != OutStr.length(); ++i)
- if (OutStr[i] == '\n') { // Left justify
- OutStr[i] = '\\';
- OutStr.insert(OutStr.begin()+i+1, 'l');
- }
- return OutStr;
- }
- };
+ if (OutStr[0] == '\n')
+ OutStr.erase(OutStr.begin());
-} // end namespace llvm
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin() + i + 1, 'l');
+ }
+ return OutStr;
+ }
+};
void MachineFunction::viewCFG() const
{
diff --git a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0f88a7b..5111322 100644
--- a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -60,13 +60,11 @@ char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
"Machine Function Printer", false, false)
-namespace llvm {
/// Returns a newly-created MachineFunction Printer pass. The
/// default banner is empty.
///
-MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
- const std::string &Banner){
+MachineFunctionPass *
+llvm::createMachineFunctionPrinterPass(raw_ostream &OS,
+ const std::string &Banner) {
return new MachineFunctionPrinterPass(OS, Banner);
}
-
-}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index fdae3b4..9feb974 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -593,15 +593,12 @@ struct MachineOutliner : public ModulePass {
char MachineOutliner::ID = 0;
-namespace llvm {
-ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
+ModulePass *llvm::createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
MachineOutliner *OL = new MachineOutliner();
OL->RunOutlinerMode = RunOutlinerMode;
return OL;
}
-} // namespace llvm
-
INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
false)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 89ed4da..a717d9e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -201,16 +201,15 @@ static cl::opt<unsigned> SwpMaxNumStores(
cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
cl::init(200));
-namespace llvm {
-
// A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
- cl::init(true),
- cl::desc("Enable CopyToPhi DAG Mutation"));
+cl::opt<bool>
+ llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+ cl::init(true),
+ cl::desc("Enable CopyToPhi DAG Mutation"));
/// A command line argument to force pipeliner to use specified issue
/// width.
-cl::opt<int> SwpForceIssueWidth(
+cl::opt<int> llvm::SwpForceIssueWidth(
"pipeliner-force-issue-width",
cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
cl::init(-1));
@@ -226,8 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
clEnumValN(WindowSchedulingFlag::WS_Force, "force",
"Use window algorithm instead of SMS algorithm.")));
-} // end namespace llvm
-
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc4..3ed1045 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -176,9 +176,7 @@ STATISTIC(NumNodeOrderPostRA,
STATISTIC(NumFirstValidPostRA,
"Number of scheduling units chosen for FirstValid heuristic post-RA");
-namespace llvm {
-
-cl::opt<MISched::Direction> PreRADirection(
+cl::opt<MISched::Direction> llvm::PreRADirection(
"misched-prera-direction", cl::Hidden,
cl::desc("Pre reg-alloc list scheduling direction"),
cl::init(MISched::Unspecified),
@@ -206,33 +204,31 @@ static cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
-cl::opt<bool> VerifyScheduling(
+cl::opt<bool> llvm::VerifyScheduling(
"verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));
#ifndef NDEBUG
-cl::opt<bool> ViewMISchedDAGs(
+cl::opt<bool> llvm::ViewMISchedDAGs(
"view-misched-dags", cl::Hidden,
cl::desc("Pop up a window to show MISched dags after they are processed"));
-cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
- cl::desc("Print schedule DAGs"));
-cl::opt<bool> MISchedDumpReservedCycles(
+cl::opt<bool> llvm::PrintDAGs("misched-print-dags", cl::Hidden,
+ cl::desc("Print schedule DAGs"));
+static cl::opt<bool> MISchedDumpReservedCycles(
"misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
cl::desc("Dump resource usage at schedule boundary."));
-cl::opt<bool> MischedDetailResourceBooking(
+static cl::opt<bool> MischedDetailResourceBooking(
"misched-detail-resource-booking", cl::Hidden, cl::init(false),
cl::desc("Show details of invoking getNextResoufceCycle."));
#else
-const bool ViewMISchedDAGs = false;
-const bool PrintDAGs = false;
-const bool MischedDetailResourceBooking = false;
+const bool llvm::ViewMISchedDAGs = false;
+const bool llvm::PrintDAGs = false;
+static const bool MischedDetailResourceBooking = false;
#ifdef LLVM_ENABLE_DUMP
-const bool MISchedDumpReservedCycles = false;
+static const bool MISchedDumpReservedCycles = false;
#endif // LLVM_ENABLE_DUMP
#endif // NDEBUG
-} // end namespace llvm
-
#ifndef NDEBUG
/// In some situations a few uninteresting nodes depend on nearly all other
/// nodes in the graph, provide a cutoff to hide them.
@@ -2053,28 +2049,24 @@ public:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createLoadClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
-createStoreClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
-} // end namespace llvm
-
// Sorting all the loads/stores first, then for each load/store, checking the
// following load/store one by one, until reach the first non-dependent one and
// call target hook to see if they can cluster.
@@ -2304,16 +2296,12 @@ protected:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
+llvm::createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
return std::make_unique<CopyConstrain>(TII, TRI);
}
-} // end namespace llvm
-
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0: = dst
@@ -3445,14 +3433,13 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
}
#endif
-namespace llvm {
/// Return true if this heuristic determines order.
/// TODO: Consider refactor return type of these functions as integer or enum,
/// as we may need to differentiate whether TryCand is better than Cand.
-bool tryLess(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal < CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3465,10 +3452,10 @@ bool tryLess(int TryVal, int CandVal,
return false;
}
-bool tryGreater(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal > CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3481,9 +3468,9 @@ bool tryGreater(int TryVal, int CandVal,
return false;
}
-bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- SchedBoundary &Zone) {
+bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
if (Zone.isTop()) {
// Prefer the candidate with the lesser depth, but only if one of them has
// depth greater than the total latency scheduled so far, otherwise either
@@ -3513,7 +3500,6 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
}
return false;
}
-} // end namespace llvm
static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
bool IsPostRA = false) {
@@ -3798,14 +3784,12 @@ void GenericScheduler::registerRoots() {
}
}
-namespace llvm {
-bool tryPressure(const PressureChange &TryP,
- const PressureChange &CandP,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason,
- const TargetRegisterInfo *TRI,
- const MachineFunction &MF) {
+bool llvm::tryPressure(const PressureChange &TryP, const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
// If one candidate decreases and the other increases, go with it.
// Invalid candidates have UnitInc==0.
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -3838,7 +3822,7 @@ bool tryPressure(const PressureChange &TryP,
return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
}
-unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned llvm::getWeakLeft(const SUnit *SU, bool isTop) {
return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}
@@ -3849,7 +3833,7 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
-int biasPhysReg(const SUnit *SU, bool isTop) {
+int llvm::biasPhysReg(const SUnit *SU, bool isTop) {
const MachineInstr *MI = SU->getInstr();
if (MI->isCopy()) {
@@ -3884,7 +3868,6 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
return 0;
}
-} // end namespace llvm
void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
@@ -4812,13 +4795,13 @@ static MachineSchedRegistry ShufflerRegistry(
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
-namespace llvm {
-template<> struct GraphTraits<
- ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+template <>
+struct llvm::GraphTraits<ScheduleDAGMI *> : public GraphTraits<ScheduleDAG *> {
+};
-template<>
-struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAGMI *> : public DefaultDOTGraphTraits {
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
@@ -4878,7 +4861,6 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
}
};
-} // end namespace llvm
#endif // NDEBUG
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index c2d4aa0..9ac3f741 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -485,10 +485,7 @@ struct LoopBounds {
// Specialize po_iterator_storage in order to prune the post-order traversal so
// it is limited to the current loop and doesn't traverse the loop back edges.
-namespace llvm {
-
-template<>
-class po_iterator_storage<LoopBounds, true> {
+template <> class llvm::po_iterator_storage<LoopBounds, true> {
LoopBounds &LB;
public:
@@ -519,8 +516,6 @@ public:
}
};
-} // end namespace llvm
-
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index 087ac62..59c587c 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -9,7 +9,7 @@
#include "llvm/CodeGen/NonRelocatableStringpool.h"
#include "llvm/ADT/STLExtras.h"
-namespace llvm {
+using namespace llvm;
DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
auto I = Strings.try_emplace(S);
@@ -43,5 +43,3 @@ NonRelocatableStringpool::getEntriesForEmission() const {
});
return Result;
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 6f373a5..e9ffa85 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -76,8 +76,6 @@ using namespace llvm::safestack;
#define DEBUG_TYPE "safe-stack"
-namespace llvm {
-
STATISTIC(NumFunctions, "Total number of functions");
STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
STATISTIC(NumUnsafeStackRestorePointsFunctions,
@@ -89,8 +87,6 @@ STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
-} // namespace llvm
-
/// Use __safestack_pointer_address even if the platform has a faster way of
/// access safe stack pointer.
static cl::opt<bool>
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index eae2e8c..9662511 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1551,14 +1551,10 @@ LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
-namespace llvm {
-
-LLVM_ATTRIBUTE_UNUSED
-raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+[[maybe_unused]]
+raw_ostream &llvm::operator<<(raw_ostream &OS, const ILPValue &Val) {
Val.print(OS);
return OS;
}
-} // end namespace llvm
-
#endif
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index e7b1494..c80eade 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -16,57 +16,51 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-namespace llvm {
- template<>
- struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAG *> : public DefaultDOTGraphTraits {
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- static std::string getGraphName(const ScheduleDAG *G) {
- return std::string(G->MF.getName());
- }
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return std::string(G->MF.getName());
+ }
- static bool renderGraphFromBottomUp() {
- return true;
- }
+ static bool renderGraphFromBottomUp() { return true; }
- static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
- return (Node->NumPreds > 10 || Node->NumSuccs > 10);
- }
+ static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+ return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ }
- static std::string getNodeIdentifierLabel(const SUnit *Node,
- const ScheduleDAG *Graph) {
- std::string R;
- raw_string_ostream OS(R);
- OS << static_cast<const void *>(Node);
- return R;
- }
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
- /// If you want to override the dot attributes printed for a particular
- /// edge, override this method.
- static std::string getEdgeAttributes(const SUnit *Node,
- SUnitIterator EI,
- const ScheduleDAG *Graph) {
- if (EI.isArtificialDep())
- return "color=cyan,style=dashed";
- if (EI.isCtrlDep())
- return "color=blue,style=dashed";
- return "";
- }
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+ std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
- std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
- static std::string getNodeAttributes(const SUnit *N,
- const ScheduleDAG *Graph) {
- return "shape=Mrecord";
- }
-
- static void addCustomGraphFeatures(ScheduleDAG *G,
- GraphWriter<ScheduleDAG*> &GW) {
- return G->addCustomGraphFeatures(GW);
- }
- };
-}
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG *> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+};
std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
const ScheduleDAG *G) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1accdd..c97300d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -509,6 +509,7 @@ namespace {
SDValue visitFMUL(SDNode *N);
template <class MatchContextClass> SDValue visitFMA(SDNode *N);
SDValue visitFMAD(SDNode *N);
+ SDValue visitFMULADD(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
@@ -657,13 +658,13 @@ namespace {
bool InexpensiveOnly = false,
std::optional<EVT> OutVT = std::nullopt);
SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
- SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
- SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
- SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
+ SDValue buildRsqrtEstimate(SDValue Op);
+ SDValue buildSqrtEstimate(SDValue Op);
+ SDValue buildSqrtEstimateImpl(SDValue Op, bool Recip);
SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
- SDNodeFlags Flags, bool Reciprocal);
+ bool Reciprocal);
SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
- SDNodeFlags Flags, bool Reciprocal);
+ bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
@@ -1991,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
case ISD::FMAD: return visitFMAD(N);
+ case ISD::FMULADD: return visitFMULADD(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
@@ -5042,7 +5044,6 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
unsigned Opc = N->getOpcode();
bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
- ConstantSDNode *N1C = isConstOrConstSplat(N1);
// X / undef -> undef
// X % undef -> undef
@@ -5074,7 +5075,7 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
// division-by-zero or remainder-by-zero, so assume the divisor is 1.
// TODO: Similarly, if we're zero-extending a boolean divisor, then assume
// it's a 1.
- if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
+ if (isOneOrOneSplat(N1) || (VT.getScalarType() == MVT::i1))
return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
return SDValue();
@@ -17758,7 +17759,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
- const TargetOptions &Options = DAG.getTarget().Options;
SDNodeFlags Flags = N->getFlags();
SelectionDAG::FlagInserter FlagsInserter(DAG, N);
@@ -17824,7 +17824,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
bool AllowNewConst = (Level < AfterLegalizeDAG);
// If nnan is enabled, fold lots of things.
- if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
+ if (Flags.hasNoNaNs() && AllowNewConst) {
// If allowed, fold (fadd (fneg x), x) -> 0.0
if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
return DAG.getConstantFP(0.0, DL, VT);
@@ -17973,7 +17973,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
EVT VT = N->getValueType(0);
SDLoc DL(N);
- const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
SelectionDAG::FlagInserter FlagsInserter(DAG, N);
@@ -18001,7 +18000,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (N0 == N1) {
// (fsub x, x) -> 0.0
- if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
+ if (Flags.hasNoNaNs())
return DAG.getConstantFP(0.0f, DL, VT);
}
@@ -18312,7 +18311,6 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
EVT VT = N->getValueType(0);
SDLoc DL(N);
- const TargetOptions &Options = DAG.getTarget().Options;
// FMA nodes have flags that propagate to the created nodes.
SelectionDAG::FlagInserter FlagsInserter(DAG, N);
MatchContextClass matcher(DAG, TLI, N);
@@ -18338,8 +18336,7 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);
}
- if ((Options.NoNaNsFPMath && N->getFlags().hasNoInfs()) ||
- (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) {
+ if (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs()) {
if (N->getFlags().hasNoSignedZeros() ||
(N2CFP && !N2CFP->isExactlyValue(-0.0))) {
if (N0CFP && N0CFP->isZero())
@@ -18444,6 +18441,21 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFMULADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Constant fold FMULADD.
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::FMULADD, DL, VT, {N0, N1, N2}))
+ return C;
+
+ return SDValue();
+}
+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
@@ -18574,20 +18586,18 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
- if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0)))
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV =
- buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
}
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV =
- buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
@@ -18619,7 +18629,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A);
SDValue AAZ =
DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0));
- if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
+ if (SDValue Rsqrt = buildRsqrtEstimate(AAZ))
return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt);
// Estimate creation failed. Clean up speculatively created nodes.
@@ -18629,7 +18639,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// We found a FSQRT, so try to make this fold:
// X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
- if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
+ if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0))) {
SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y);
AddToWorklist(Div.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, Div);
@@ -18726,11 +18736,12 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
return SDValue();
// FSQRT nodes have flags that propagate to the created nodes.
+ SelectionDAG::FlagInserter FlagInserter(DAG, Flags);
// TODO: If this is N0/sqrt(N0), and we reach this node before trying to
// transform the fdiv, we may produce a sub-optimal estimate sequence
// because the reciprocal calculation may not have to filter out a
// 0.0 input.
- return buildSqrtEstimate(N0, Flags);
+ return buildSqrtEstimate(N0);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
@@ -29727,28 +29738,27 @@ SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags Flags, bool Reciprocal) {
+ unsigned Iterations, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
// We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
// this entire sequence requires only one FP constant.
- SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
- HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg);
+ HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg);
// Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
for (unsigned i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
- NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
- NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst);
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
}
// If non-reciprocal square root is requested, multiply the result by Arg.
if (!Reciprocal)
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg);
return Est;
}
@@ -29759,8 +29769,7 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
/// =>
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags Flags, bool Reciprocal) {
+ unsigned Iterations, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
@@ -29773,9 +29782,9 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
// Newton iterations for reciprocal square root:
// E = (E * -0.5) * ((A * E) * E + -3.0)
for (unsigned i = 0; i < Iterations; ++i) {
- SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
- SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
- SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
+ SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est);
+ SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est);
+ SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree);
// When calculating a square root at the last iteration build:
// S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
@@ -29783,13 +29792,13 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
SDValue LHS;
if (Reciprocal || (i + 1) < Iterations) {
// RSQRT: LHS = (E * -0.5)
- LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf);
} else {
// SQRT: LHS = (A * E) * -0.5
- LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf);
}
- Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS);
}
return Est;
@@ -29798,8 +29807,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
-SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
- bool Reciprocal) {
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, bool Reciprocal) {
if (LegalDAG)
return SDValue();
@@ -29827,8 +29835,8 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
if (Iterations > 0)
Est = UseOneConstNR
- ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
- : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+ ? buildSqrtNROneConst(Op, Est, Iterations, Reciprocal)
+ : buildSqrtNRTwoConst(Op, Est, Iterations, Reciprocal);
if (!Reciprocal) {
SDLoc DL(Op);
// Try the target specific test first.
@@ -29846,12 +29854,12 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
return SDValue();
}
-SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
- return buildSqrtEstimateImpl(Op, Flags, true);
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op) {
+ return buildSqrtEstimateImpl(Op, true);
}
-SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
- return buildSqrtEstimateImpl(Op, Flags, false);
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op) {
+ return buildSqrtEstimateImpl(Op, false);
}
/// Return true if there is any possibility that the two addresses overlap.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 08af74c..90edaf3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5063,8 +5063,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
break;
case ISD::ADD:
case ISD::ADDC:
- // Add can have at most one carry bit. Thus we know that the output
- // is, at worst, one more bit than the inputs.
+ // TODO: Move Operand 1 check before Operand 0 check
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp == 1) return 1; // Early out.
@@ -5088,6 +5087,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
if (Tmp2 == 1) return 1; // Early out.
+
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
return std::min(Tmp, Tmp2) - 1;
case ISD::SUB:
Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -5786,6 +5788,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FCOPYSIGN:
case ISD::FMA:
case ISD::FMAD:
+ case ISD::FMULADD:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
@@ -5904,6 +5907,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
case ISD::FCOSH:
case ISD::FTANH:
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (SNaN)
return true;
@@ -6401,8 +6405,9 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
if (VT.isScalableVector())
return SDValue();
- // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
- // simplified to one big BUILD_VECTOR.
+  // A CONCAT_VECTOR whose operands are all scalar sources — UNDEF,
+  // BUILD_VECTOR, or single-element INSERT_VECTOR_ELT — can be simplified
+  // to one big BUILD_VECTOR.
// FIXME: Add support for SCALAR_TO_VECTOR as well.
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 16> Elts;
@@ -6412,6 +6417,10 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
else if (Op.getOpcode() == ISD::BUILD_VECTOR)
Elts.append(Op->op_begin(), Op->op_end());
+ else if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ OpVT.getVectorNumElements() == 1 &&
+ isNullConstant(Op.getOperand(2)))
+ Elts.push_back(Op.getOperand(1));
else
return SDValue();
}
@@ -7231,7 +7240,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
// Handle fma/fmad special cases.
- if (Opcode == ISD::FMA || Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMA || Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
Ops[2].getValueType() == VT && "FMA types must match!");
@@ -7242,7 +7251,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
APFloat V1 = C1->getValueAPF();
const APFloat &V2 = C2->getValueAPF();
const APFloat &V3 = C3->getValueAPF();
- if (Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
V1.multiply(V2, APFloat::rmNearestTiesToEven);
V1.add(V3, APFloat::rmNearestTiesToEven);
} else
@@ -8781,7 +8790,7 @@ static SDValue getMemcpyLoadsAndStores(
if (Value.getNode()) {
Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
}
@@ -8797,7 +8806,7 @@ static SDValue getMemcpyLoadsAndStores(
assert(NVT.bitsGE(VT));
bool isDereferenceable =
- SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
@@ -8806,14 +8815,14 @@ static SDValue getMemcpyLoadsAndStores(
Value = DAG.getExtLoad(
ISD::EXTLOAD, dl, NVT, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), VT,
commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
OutStoreChains.push_back(Store);
}
@@ -8943,14 +8952,14 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Value;
bool isDereferenceable =
- SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
Value = DAG.getLoad(
VT, dl, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
@@ -8965,7 +8974,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Store = DAG.getStore(
Chain, dl, LoadValues[i],
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
DstOff += VTSize;
@@ -9097,7 +9106,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
assert(Value.getValueType() == VT && "Value with wrong type.");
SDValue Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment,
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
NewAAInfo);
@@ -11844,25 +11853,38 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops) {
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute) {
SDNodeFlags Flags;
if (Inserter)
Flags = Inserter->getFlags();
- return getNodeIfExists(Opcode, VTList, Ops, Flags);
+ return getNodeIfExists(Opcode, VTList, Ops, Flags, AllowCommute);
}
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags) {
- if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ const SDNodeFlags Flags,
+ bool AllowCommute) {
+ if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)
+ return nullptr;
+
+ auto Lookup = [&](ArrayRef<SDValue> LookupOps) -> SDNode * {
FoldingSetNodeID ID;
- AddNodeIDNode(ID, Opcode, VTList, Ops);
+ AddNodeIDNode(ID, Opcode, VTList, LookupOps);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP)) {
E->intersectFlagsWith(Flags);
return E;
}
- }
+ return nullptr;
+ };
+
+ if (SDNode *Existing = Lookup(Ops))
+ return Existing;
+
+ if (AllowCommute && TLI->isCommutativeBinOp(Opcode))
+ return Lookup({Ops[1], Ops[0]});
+
return nullptr;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c21890a..cb0038c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3971,8 +3971,14 @@ void SelectionDAGBuilder::visitSIToFP(const User &I) {
}
void SelectionDAGBuilder::visitPtrToAddr(const User &I) {
- // FIXME: this is not correct for pointers with addr width != pointer width
- visitPtrToInt(I);
+ SDValue N = getValue(I.getOperand(0));
+ // By definition the type of the ptrtoaddr must be equal to the address type.
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ EVT AddrVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ // The address width must be smaller or equal to the pointer representation
+ // width, so we lower ptrtoaddr as a truncate (possibly folded to a no-op).
+ N = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), AddrVT, N);
+ setValue(&I, N);
}
void SelectionDAGBuilder::visitPtrToInt(const User &I) {
@@ -6996,6 +7002,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2)), Flags));
+ } else if (TLI.isOperationLegalOrCustom(ISD::FMULADD, VT)) {
+ // TODO: Support splitting the vector.
+ setValue(&I, DAG.getNode(ISD::FMULADD, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)), Flags));
} else {
// TODO: Intrinsic calls should have fast-math-flags.
SDValue Mul = DAG.getNode(
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fcfbfe6..39cbfad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FMA: return "fma";
case ISD::STRICT_FMA: return "strict_fma";
case ISD::FMAD: return "fmad";
+ case ISD::FMULADD: return "fmuladd";
case ISD::FREM: return "frem";
case ISD::STRICT_FREM: return "strict_frem";
case ISD::FCOPYSIGN: return "fcopysign";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc503d3..920dff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7676,6 +7676,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
break;
}
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (!Flags.hasNoSignedZeros())
break;
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 64e5cd5..95a9c3f 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -306,10 +306,7 @@ char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisLegacy::ID;
INITIALIZE_PASS(StackFrameLayoutAnalysisLegacy, "stack-frame-layout",
"Stack Frame Layout", false, false)
-namespace llvm {
/// Returns a newly-created StackFrameLayout pass.
-MachineFunctionPass *createStackFrameLayoutAnalysisPass() {
+MachineFunctionPass *llvm::createStackFrameLayoutAnalysisPass() {
return new StackFrameLayoutAnalysisLegacy();
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
index 53a9ab4..eac20120 100644
--- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp
+++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
@@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) {
bool Changed = false;
for (auto &GV : M.globals()) {
- if (GV.isDeclarationForLinker())
+ if (!llvm::memprof::IsAnnotationOK(GV))
continue;
- // The implementation below assumes prior passes don't set section prefixes,
- // and specifically do 'assign' rather than 'update'. So report error if a
- // section prefix is already set.
- if (auto maybeSectionPrefix = GV.getSectionPrefix();
- maybeSectionPrefix && !maybeSectionPrefix->empty())
- llvm::report_fatal_error("Global variable " + GV.getName() +
- " already has a section prefix " +
- *maybeSectionPrefix);
-
StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI);
- if (SectionPrefix.empty())
- continue;
-
+ // setSectionPrefix returns true if the section prefix is updated.
Changed |= GV.setSectionPrefix(SectionPrefix);
}
diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp
index e22dc25..1593a40 100644
--- a/llvm/lib/CodeGen/StaticDataSplitter.cpp
+++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp
@@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op,
if (Op.isGlobal()) {
// Find global variables with local linkage.
const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal());
- // Skip 'llvm.'-prefixed global variables conservatively because they are
- // often handled specially, and skip those not in static data
- // sections.
- if (!GV || GV->getName().starts_with("llvm.") ||
+ // Skip those not eligible for annotation or not in static data sections.
+ if (!GV || !llvm::memprof::IsAnnotationOK(*GV) ||
!inStaticDataSection(*GV, TM))
return nullptr;
return GV;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c23281a..060b1dd 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -815,7 +815,8 @@ void TargetLoweringBase::initActions() {
ISD::FTAN, ISD::FACOS,
ISD::FASIN, ISD::FATAN,
ISD::FCOSH, ISD::FSINH,
- ISD::FTANH, ISD::FATAN2},
+ ISD::FTANH, ISD::FATAN2,
+ ISD::FMULADD},
VT, Expand);
// Overflow operations default to expand
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index c9e4618..971f822 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -102,10 +102,8 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
return true;
}
-namespace llvm {
-
-Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
- unsigned SubIdx, const MachineRegisterInfo *MRI) {
+Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI,
+ unsigned SubIdx, const MachineRegisterInfo *MRI) {
return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
if (!Reg)
OS << "$noreg";
@@ -135,7 +133,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
});
}
-Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
// Generic printout when TRI is missing.
if (!TRI) {
@@ -158,7 +156,7 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
if (Register::isVirtualRegister(Unit)) {
OS << '%' << Register(Unit).virtRegIndex();
@@ -168,8 +166,9 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
- const TargetRegisterInfo *TRI) {
+Printable llvm::printRegClassOrBank(Register Reg,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
if (RegInfo.getRegClassOrNull(Reg))
OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
@@ -183,8 +182,6 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
});
}
-} // end namespace llvm
-
/// getAllocatableClass - Return the maximal subclass of the given register
/// class that is alloctable, or NULL.
const TargetRegisterClass *
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 6610eef..c61f757 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -181,8 +181,8 @@ DWARFDebugFrame::DWARFDebugFrame(Triple::ArchType Arch,
DWARFDebugFrame::~DWARFDebugFrame() = default;
-static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
- uint64_t Offset, int Length) {
+[[maybe_unused]] static void dumpDataAux(DataExtractor Data, uint64_t Offset,
+ int Length) {
errs() << "DUMP: ";
for (int i = 0; i < Length; ++i) {
uint8_t c = Data.getU8(&Offset);
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index 7a0256f..fa39603 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -338,9 +338,13 @@ static void convertFunctionLineTable(OutputAggregator &Out, CUInfo &CUI,
if (FilePath.empty()) {
// If we had a DW_AT_decl_file, but got no file then we need to emit a
// warning.
+ const uint64_t DwarfFileIdx = dwarf::toUnsigned(
+ Die.findRecursively(dwarf::DW_AT_decl_file), UINT32_MAX);
+      // Check if there is no DW_AT_decl_file attribute, and don't report an
+      // error if it isn't there.
+ if (DwarfFileIdx == UINT32_MAX)
+ return;
Out.Report("Invalid file index in DW_AT_decl_file", [&](raw_ostream &OS) {
- const uint64_t DwarfFileIdx = dwarf::toUnsigned(
- Die.findRecursively(dwarf::DW_AT_decl_file), UINT32_MAX);
OS << "error: function DIE at " << HEX32(Die.getOffset())
<< " has an invalid file index " << DwarfFileIdx
<< " in its DW_AT_decl_file attribute, unable to create a single "
@@ -629,6 +633,10 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
size_t NumBefore = Gsym.getNumFunctionInfos();
auto getDie = [&](DWARFUnit &DwarfUnit) -> DWARFDie {
DWARFDie ReturnDie = DwarfUnit.getUnitDIE(false);
+ // Apple uses DW_AT_GNU_dwo_id for things other than split DWARF.
+ if (IsMachO)
+ return ReturnDie;
+
if (DwarfUnit.getDWOId()) {
DWARFUnit *DWOCU = DwarfUnit.getNonSkeletonUnitDIE(false).getDwarfUnit();
if (!DWOCU->isDWOUnit())
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
index 5b3c05e..6c7e27e 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
@@ -260,22 +260,17 @@ public:
}
// Run finalization actions.
- using WrapperFunctionCall = orc::shared::WrapperFunctionCall;
- runFinalizeActions(
- G->allocActions(),
- [this, OnFinalized = std::move(OnFinalized)](
- Expected<std::vector<WrapperFunctionCall>> DeallocActions) mutable {
- completeFinalization(std::move(OnFinalized),
- std::move(DeallocActions));
- });
- }
+ auto DeallocActions = runFinalizeActions(G->allocActions());
+ if (!DeallocActions) {
+ OnFinalized(DeallocActions.takeError());
+ return;
+ }
- void abandon(OnAbandonedFunction OnAbandoned) override {
- Error Err = Error::success();
- if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments))
- Err = joinErrors(std::move(Err), errorCodeToError(EC));
- if (auto EC = sys::Memory::releaseMappedMemory(StandardSegments))
- Err = joinErrors(std::move(Err), errorCodeToError(EC));
+ // Release the finalize segments slab.
+ if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments)) {
+ OnFinalized(errorCodeToError(EC));
+ return;
+ }
#ifndef NDEBUG
// Set 'G' to null to flag that we've been successfully finalized.
@@ -284,22 +279,17 @@ public:
G = nullptr;
#endif
- OnAbandoned(std::move(Err));
+ // Continue with finalized allocation.
+ OnFinalized(MemMgr.createFinalizedAlloc(std::move(StandardSegments),
+ std::move(*DeallocActions)));
}
-private:
- void completeFinalization(
- OnFinalizedFunction OnFinalized,
- Expected<std::vector<orc::shared::WrapperFunctionCall>> DeallocActions) {
-
- if (!DeallocActions)
- return OnFinalized(DeallocActions.takeError());
-
- // Release the finalize segments slab.
- if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments)) {
- OnFinalized(errorCodeToError(EC));
- return;
- }
+ void abandon(OnAbandonedFunction OnAbandoned) override {
+ Error Err = Error::success();
+ if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
+ if (auto EC = sys::Memory::releaseMappedMemory(StandardSegments))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
#ifndef NDEBUG
// Set 'G' to null to flag that we've been successfully finalized.
@@ -308,11 +298,10 @@ private:
G = nullptr;
#endif
- // Continue with finalized allocation.
- OnFinalized(MemMgr.createFinalizedAlloc(std::move(StandardSegments),
- std::move(*DeallocActions)));
+ OnAbandoned(std::move(Err));
}
+private:
Error applyProtections() {
for (auto &KV : BL.segments()) {
const auto &AG = KV.first;
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index 0ffe3ae..f343925 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT
SectCreate.cpp
SelfExecutorProcessControl.cpp
SimpleRemoteEPC.cpp
+ SimpleRemoteMemoryMapper.cpp
Speculation.cpp
SpeculateAnalyses.cpp
ExecutorProcessControl.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
index 50e6b25..0833af7 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -57,16 +57,17 @@ public:
std::swap(FR.Actions, G.allocActions());
Parent.EPC.callSPSWrapperAsync<
- rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
- Parent.SAs.Finalize,
+ rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+ Parent.SAs.Initialize,
[OnFinalize = std::move(OnFinalize), AllocAddr = this->AllocAddr](
- Error SerializationErr, Error FinalizeErr) mutable {
+ Error SerializationErr,
+ Expected<ExecutorAddr> InitializeKey) mutable {
// FIXME: Release abandoned alloc.
if (SerializationErr) {
- cantFail(std::move(FinalizeErr));
+ cantFail(InitializeKey.takeError());
OnFinalize(std::move(SerializationErr));
- } else if (FinalizeErr)
- OnFinalize(std::move(FinalizeErr));
+ } else if (!InitializeKey)
+ OnFinalize(InitializeKey.takeError());
else
OnFinalize(FinalizedAlloc(AllocAddr));
},
@@ -76,8 +77,8 @@ public:
void abandon(OnAbandonedFunction OnAbandoned) override {
// FIXME: Return memory to pool instead.
Parent.EPC.callSPSWrapperAsync<
- rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
- Parent.SAs.Deallocate,
+ rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+ Parent.SAs.Release,
[OnAbandoned = std::move(OnAbandoned)](Error SerializationErr,
Error DeallocateErr) mutable {
if (SerializationErr) {
@@ -123,9 +124,8 @@ void EPCGenericJITLinkMemoryManager::allocate(const JITLinkDylib *JD,
void EPCGenericJITLinkMemoryManager::deallocate(
std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) {
- EPC.callSPSWrapperAsync<
- rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
- SAs.Deallocate,
+ EPC.callSPSWrapperAsync<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
+ SAs.Release,
[OnDeallocated = std::move(OnDeallocated)](Error SerErr,
Error DeallocErr) mutable {
if (SerErr) {
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
index fec7062..cc72488 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -25,9 +25,9 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
if (auto Err = EPC.getBootstrapSymbols(
{{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
{SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
- {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
- {SAs.Deallocate,
- rt::SimpleExecutorMemoryManagerDeallocateWrapperName},
+ {SAs.Initialize,
+ rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+ {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName},
{SAs.RegisterEHFrame, rt::RegisterEHFrameSectionAllocActionName},
{SAs.DeregisterEHFrame,
rt::DeregisterEHFrameSectionAllocActionName}}))
@@ -48,7 +48,7 @@ EPCGenericRTDyldMemoryManager::~EPCGenericRTDyldMemoryManager() {
Error Err = Error::success();
if (auto Err2 = EPC.callSPSWrapper<
- rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>(
+ rt::SPSSimpleExecutorMemoryManagerReleaseSignature>(
SAs.Reserve, Err, SAs.Instance, FinalizedAllocs)) {
// FIXME: Report errors through EPC once that functionality is available.
logAllUnhandledErrors(std::move(Err2), errs(), "");
@@ -267,10 +267,10 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
// We'll also need to make an extra allocation for the eh-frame wrapper call
// arguments.
- Error FinalizeErr = Error::success();
+ Expected<ExecutorAddr> InitializeKey((ExecutorAddr()));
if (auto Err = EPC.callSPSWrapper<
- rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
- SAs.Finalize, FinalizeErr, SAs.Instance, std::move(FR))) {
+ rt::SPSSimpleExecutorMemoryManagerInitializeSignature>(
+ SAs.Initialize, InitializeKey, SAs.Instance, std::move(FR))) {
std::lock_guard<std::mutex> Lock(M);
this->ErrMsg = toString(std::move(Err));
dbgs() << "Serialization error: " << this->ErrMsg << "\n";
@@ -278,9 +278,9 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
*ErrMsg = this->ErrMsg;
return true;
}
- if (FinalizeErr) {
+ if (!InitializeKey) {
std::lock_guard<std::mutex> Lock(M);
- this->ErrMsg = toString(std::move(FinalizeErr));
+ this->ErrMsg = toString(InitializeKey.takeError());
dbgs() << "Finalization error: " << this->ErrMsg << "\n";
if (ErrMsg)
*ErrMsg = this->ErrMsg;
diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
index 33734b8..bb8d2cb 100644
--- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
@@ -90,7 +90,7 @@ void MapperJITLinkMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
auto TotalSize = Seg.ContentSize + Seg.ZeroFillSize;
Seg.Addr = NextSegAddr;
- Seg.WorkingMem = Mapper->prepare(NextSegAddr, TotalSize);
+ Seg.WorkingMem = Mapper->prepare(G, NextSegAddr, TotalSize);
NextSegAddr += alignTo(TotalSize, Mapper->getPageSize());
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index ea3b22a..7e606c6a 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -58,7 +58,8 @@ void InProcessMemoryMapper::reserve(size_t NumBytes,
ExecutorAddrRange(ExecutorAddr::fromPtr(MB.base()), MB.allocatedSize()));
}
-char *InProcessMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) {
+char *InProcessMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) {
return Addr.toPtr<char *>();
}
@@ -90,19 +91,9 @@ void InProcessMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
sys::Memory::InvalidateInstructionCache(Base.toPtr<void *>(), Size);
}
- std::vector<shared::WrapperFunctionCall> DeinitializeActions;
- {
- std::promise<MSVCPExpected<std::vector<shared::WrapperFunctionCall>>> P;
- auto F = P.get_future();
- shared::runFinalizeActions(
- AI.Actions, [&](Expected<std::vector<shared::WrapperFunctionCall>> R) {
- P.set_value(std::move(R));
- });
- if (auto DeinitializeActionsOrErr = F.get())
- DeinitializeActions = std::move(*DeinitializeActionsOrErr);
- else
- return OnInitialized(DeinitializeActionsOrErr.takeError());
- }
+ auto DeinitializeActions = shared::runFinalizeActions(AI.Actions);
+ if (!DeinitializeActions)
+ return OnInitialized(DeinitializeActions.takeError());
{
std::lock_guard<std::mutex> Lock(Mutex);
@@ -110,7 +101,7 @@ void InProcessMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
// This is the maximum range whose permission have been possibly modified
auto &Alloc = Allocations[MinAddr];
Alloc.Size = MaxAddr - MinAddr;
- Alloc.DeinitializationActions = std::move(DeinitializeActions);
+ Alloc.DeinitializationActions = std::move(*DeinitializeActions);
Reservations[AI.MappingBase.toPtr<void *>()].Allocations.push_back(MinAddr);
}
@@ -127,10 +118,10 @@ void InProcessMemoryMapper::deinitialize(
for (auto Base : llvm::reverse(Bases)) {
- shared::runDeallocActions(
- Allocations[Base].DeinitializationActions, [&](Error Err) {
- AllErr = joinErrors(std::move(AllErr), std::move(Err));
- });
+ if (Error Err = shared::runDeallocActions(
+ Allocations[Base].DeinitializationActions)) {
+ AllErr = joinErrors(std::move(AllErr), std::move(Err));
+ }
// Reset protections to read/write so the area can be reused
if (auto EC = sys::Memory::protectMappedMemory(
@@ -324,7 +315,8 @@ void SharedMemoryMapper::reserve(size_t NumBytes,
#endif
}
-char *SharedMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) {
+char *SharedMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) {
auto R = Reservations.upper_bound(Addr);
assert(R != Reservations.begin() && "Attempt to prepare unreserved range");
R--;
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
index 08ab0c6..91f2899 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
@@ -12,39 +12,31 @@ namespace llvm {
namespace orc {
namespace shared {
-void runFinalizeActions(AllocActions &AAs,
- OnRunFinalizeActionsCompleteFn OnComplete) {
+Expected<std::vector<WrapperFunctionCall>>
+runFinalizeActions(AllocActions &AAs) {
std::vector<WrapperFunctionCall> DeallocActions;
DeallocActions.reserve(numDeallocActions(AAs));
for (auto &AA : AAs) {
if (AA.Finalize)
-
- if (auto Err = AA.Finalize.runWithSPSRetErrorMerged()) {
- while (!DeallocActions.empty()) {
- Err = joinErrors(std::move(Err),
- DeallocActions.back().runWithSPSRetErrorMerged());
- DeallocActions.pop_back();
- }
- return OnComplete(std::move(Err));
- }
+ if (auto Err = AA.Finalize.runWithSPSRetErrorMerged())
+ return joinErrors(std::move(Err), runDeallocActions(DeallocActions));
if (AA.Dealloc)
DeallocActions.push_back(std::move(AA.Dealloc));
}
AAs.clear();
- OnComplete(std::move(DeallocActions));
+ return DeallocActions;
}
-void runDeallocActions(ArrayRef<WrapperFunctionCall> DAs,
- OnRunDeallocActionsComeleteFn OnComplete) {
+Error runDeallocActions(ArrayRef<WrapperFunctionCall> DAs) {
Error Err = Error::success();
while (!DAs.empty()) {
Err = joinErrors(std::move(Err), DAs.back().runWithSPSRetErrorMerged());
DAs = DAs.drop_back();
}
- OnComplete(std::move(Err));
+ return Err;
}
} // namespace shared
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 26e8f53..cc99d3c 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -23,10 +23,12 @@ const char *SimpleExecutorMemoryManagerInstanceName =
"__llvm_orc_SimpleExecutorMemoryManager_Instance";
const char *SimpleExecutorMemoryManagerReserveWrapperName =
"__llvm_orc_SimpleExecutorMemoryManager_reserve_wrapper";
-const char *SimpleExecutorMemoryManagerFinalizeWrapperName =
- "__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper";
-const char *SimpleExecutorMemoryManagerDeallocateWrapperName =
- "__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper";
+const char *SimpleExecutorMemoryManagerInitializeWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_initialize_wrapper";
+const char *SimpleExecutorMemoryManagerDeinitializeWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_deinitialize_wrapper";
+const char *SimpleExecutorMemoryManagerReleaseWrapperName =
+ "__llvm_orc_SimpleExecutorMemoryManager_release_wrapper";
const char *ExecutorSharedMemoryMapperServiceInstanceName =
"__llvm_orc_ExecutorSharedMemoryMapperService_Instance";
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
index 87d7578..dec1df7 100644
--- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp
@@ -216,9 +216,9 @@ SimpleRemoteEPC::createDefaultMemoryManager(SimpleRemoteEPC &SREPC) {
if (auto Err = SREPC.getBootstrapSymbols(
{{SAs.Allocator, rt::SimpleExecutorMemoryManagerInstanceName},
{SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
- {SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
- {SAs.Deallocate,
- rt::SimpleExecutorMemoryManagerDeallocateWrapperName}}))
+ {SAs.Initialize,
+ rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+ {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
return std::move(Err);
return std::make_unique<EPCGenericJITLinkMemoryManager>(SREPC, SAs);
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
new file mode 100644
index 0000000..b82de3f
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
@@ -0,0 +1,104 @@
+//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+namespace llvm::orc {
+
+SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC,
+ SymbolAddrs SAs)
+ : EPC(EPC), SAs(SAs) {}
+
+void SimpleRemoteMemoryMapper::reserve(size_t NumBytes,
+ OnReservedFunction OnReserved) {
+ EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>(
+ SAs.Reserve,
+ [NumBytes, OnReserved = std::move(OnReserved)](
+ Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ return OnReserved(std::move(SerializationErr));
+ }
+
+ if (Result)
+ OnReserved(ExecutorAddrRange(*Result, NumBytes));
+ else
+ OnReserved(Result.takeError());
+ },
+ SAs.Instance, static_cast<uint64_t>(NumBytes));
+}
+
+char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G,
+ ExecutorAddr Addr, size_t ContentSize) {
+ return G.allocateBuffer(ContentSize).data();
+}
+
+void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
+ OnInitializedFunction OnInitialized) {
+
+ tpctypes::FinalizeRequest FR;
+
+ std::swap(FR.Actions, AI.Actions);
+ FR.Segments.reserve(AI.Segments.size());
+
+ for (auto Seg : AI.Segments)
+ FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset,
+ Seg.ContentSize + Seg.ZeroFillSize,
+ ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)});
+
+ EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>(
+ SAs.Initialize,
+ [OnInitialized = std::move(OnInitialized)](
+ Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ return OnInitialized(std::move(SerializationErr));
+ }
+
+ OnInitialized(std::move(Result));
+ },
+ SAs.Instance, std::move(FR));
+}
+
+void SimpleRemoteMemoryMapper::deinitialize(
+ ArrayRef<ExecutorAddr> Allocations,
+ MemoryMapper::OnDeinitializedFunction OnDeinitialized) {
+ EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>(
+ SAs.Deinitialize,
+ [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr,
+ Error Result) mutable {
+ if (SerializationErr) {
+ cantFail(std::move(Result));
+ return OnDeinitialized(std::move(SerializationErr));
+ }
+
+ OnDeinitialized(std::move(Result));
+ },
+ SAs.Instance, Allocations);
+}
+
+void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
+ OnReleasedFunction OnReleased) {
+ EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>(
+ SAs.Release,
+ [OnReleased = std::move(OnReleased)](Error SerializationErr,
+ Error Result) mutable {
+ if (SerializationErr) {
+ cantFail(std::move(Result));
+ return OnReleased(std::move(SerializationErr));
+ }
+
+ return OnReleased(std::move(Result));
+ },
+ SAs.Instance, Bases);
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
index 8c24b1f..4fbf232 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
@@ -9,10 +9,8 @@
#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
-#include "llvm/Support/MSVCErrorWorkarounds.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/WindowsError.h"
-#include <future>
#include <sstream>
#if defined(LLVM_ON_UNIX)
@@ -183,24 +181,15 @@ Expected<ExecutorAddr> ExecutorSharedMemoryMapperService::initialize(
}
// Run finalization actions and get deinitlization action list.
- std::vector<shared::WrapperFunctionCall> DeinitializeActions;
- {
- std::promise<MSVCPExpected<std::vector<shared::WrapperFunctionCall>>> P;
- auto F = P.get_future();
- shared::runFinalizeActions(
- FR.Actions, [&](Expected<std::vector<shared::WrapperFunctionCall>> R) {
- P.set_value(std::move(R));
- });
- if (auto DeinitializeActionsOrErr = F.get())
- DeinitializeActions = std::move(*DeinitializeActionsOrErr);
- else
- return DeinitializeActionsOrErr.takeError();
+ auto DeinitializeActions = shared::runFinalizeActions(FR.Actions);
+ if (!DeinitializeActions) {
+ return DeinitializeActions.takeError();
}
{
std::lock_guard<std::mutex> Lock(Mutex);
Allocations[MinAddr].DeinitializationActions =
- std::move(DeinitializeActions);
+ std::move(*DeinitializeActions);
Reservations[Reservation.toPtr<void *>()].Allocations.push_back(MinAddr);
}
@@ -221,11 +210,10 @@ Error ExecutorSharedMemoryMapperService::deinitialize(
std::lock_guard<std::mutex> Lock(Mutex);
for (auto Base : llvm::reverse(Bases)) {
- shared::runDeallocActions(
- Allocations[Base].DeinitializationActions, [&](Error Err) {
- if (Err)
- AllErr = joinErrors(std::move(AllErr), std::move(Err));
- });
+ if (Error Err = shared::runDeallocActions(
+ Allocations[Base].DeinitializationActions)) {
+ AllErr = joinErrors(std::move(AllErr), std::move(Err));
+ }
// Remove the allocation from the allocation list of its reservation
for (auto &Reservation : Reservations) {
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 3cdffb8..fe881a1 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -8,6 +8,7 @@
#include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
#include "llvm/Support/FormatVariadic.h"
@@ -18,166 +19,167 @@ namespace orc {
namespace rt_bootstrap {
SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() {
- assert(Allocations.empty() && "shutdown not called?");
+ assert(Slabs.empty() && "shutdown not called?");
}
-Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
+Expected<ExecutorAddr> SimpleExecutorMemoryManager::reserve(uint64_t Size) {
std::error_code EC;
auto MB = sys::Memory::allocateMappedMemory(
Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
if (EC)
return errorCodeToError(EC);
std::lock_guard<std::mutex> Lock(M);
- assert(!Allocations.count(MB.base()) && "Duplicate allocation addr");
- Allocations[MB.base()].Size = Size;
+ assert(!Slabs.count(MB.base()) && "Duplicate allocation addr");
+ Slabs[MB.base()].Size = Size;
return ExecutorAddr::fromPtr(MB.base());
}
-Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
- ExecutorAddr Base(~0ULL);
+Expected<ExecutorAddr>
+SimpleExecutorMemoryManager::initialize(tpctypes::FinalizeRequest &FR) {
std::vector<shared::WrapperFunctionCall> DeallocationActions;
- size_t SuccessfulFinalizationActions = 0;
if (FR.Segments.empty()) {
- // NOTE: Finalizing nothing is currently a no-op. Should it be an error?
if (FR.Actions.empty())
- return Error::success();
+ return make_error<StringError>("Finalization request is empty",
+ inconvertibleErrorCode());
else
return make_error<StringError>("Finalization actions attached to empty "
"finalization request",
inconvertibleErrorCode());
}
- for (auto &Seg : FR.Segments)
- Base = std::min(Base, Seg.Addr);
-
- for (auto &ActPair : FR.Actions)
- if (ActPair.Dealloc)
- DeallocationActions.push_back(ActPair.Dealloc);
-
- // Get the Allocation for this finalization.
- size_t AllocSize = 0;
- {
- std::lock_guard<std::mutex> Lock(M);
- auto I = Allocations.find(Base.toPtr<void *>());
- if (I == Allocations.end())
- return make_error<StringError>("Attempt to finalize unrecognized "
- "allocation " +
- formatv("{0:x}", Base.getValue()),
- inconvertibleErrorCode());
- AllocSize = I->second.Size;
- I->second.DeallocationActions = std::move(DeallocationActions);
- }
- ExecutorAddr AllocEnd = Base + ExecutorAddrDiff(AllocSize);
-
- // Bail-out function: this will run deallocation actions corresponding to any
- // completed finalization actions, then deallocate memory.
- auto BailOut = [&](Error Err) {
- std::pair<void *, Allocation> AllocToDestroy;
-
- // Get allocation to destroy.
- {
- std::lock_guard<std::mutex> Lock(M);
- auto I = Allocations.find(Base.toPtr<void *>());
-
- // Check for missing allocation (effective a double free).
- if (I == Allocations.end())
- return joinErrors(
- std::move(Err),
- make_error<StringError>("No allocation entry found "
- "for " +
- formatv("{0:x}", Base.getValue()),
- inconvertibleErrorCode()));
- AllocToDestroy = std::move(*I);
- Allocations.erase(I);
- }
+ ExecutorAddrRange RR(FR.Segments.front().Addr, FR.Segments.front().Addr);
- // Run deallocation actions for all completed finalization actions.
- while (SuccessfulFinalizationActions)
- Err =
- joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions]
- .Dealloc.runWithSPSRetErrorMerged());
-
- // Deallocate memory.
- sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size);
- if (auto EC = sys::Memory::releaseMappedMemory(MB))
- Err = joinErrors(std::move(Err), errorCodeToError(EC));
-
- return Err;
- };
+ std::vector<sys::MemoryBlock> MBsToReset;
+ auto ResetMBs = make_scope_exit([&]() {
+ for (auto &MB : MBsToReset)
+ sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ |
+ sys::Memory::MF_WRITE);
+ sys::Memory::InvalidateInstructionCache(RR.Start.toPtr<void *>(),
+ RR.size());
+ });
// Copy content and apply permissions.
for (auto &Seg : FR.Segments) {
+ RR.Start = std::min(RR.Start, Seg.Addr);
+ RR.End = std::max(RR.End, Seg.Addr + Seg.Size);
// Check segment ranges.
if (LLVM_UNLIKELY(Seg.Size < Seg.Content.size()))
- return BailOut(make_error<StringError>(
+ return make_error<StringError>(
formatv("Segment {0:x} content size ({1:x} bytes) "
"exceeds segment size ({2:x} bytes)",
Seg.Addr.getValue(), Seg.Content.size(), Seg.Size),
- inconvertibleErrorCode()));
+ inconvertibleErrorCode());
ExecutorAddr SegEnd = Seg.Addr + ExecutorAddrDiff(Seg.Size);
- if (LLVM_UNLIKELY(Seg.Addr < Base || SegEnd > AllocEnd))
- return BailOut(make_error<StringError>(
+ if (LLVM_UNLIKELY(Seg.Addr < RR.Start || SegEnd > RR.End))
+ return make_error<StringError>(
formatv("Segment {0:x} -- {1:x} crosses boundary of "
"allocation {2:x} -- {3:x}",
- Seg.Addr.getValue(), SegEnd.getValue(), Base.getValue(),
- AllocEnd.getValue()),
- inconvertibleErrorCode()));
+ Seg.Addr, SegEnd, RR.Start, RR.End),
+ inconvertibleErrorCode());
char *Mem = Seg.Addr.toPtr<char *>();
if (!Seg.Content.empty())
memcpy(Mem, Seg.Content.data(), Seg.Content.size());
memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
assert(Seg.Size <= std::numeric_limits<size_t>::max());
+
+ sys::MemoryBlock MB(Mem, Seg.Size);
if (auto EC = sys::Memory::protectMappedMemory(
- {Mem, static_cast<size_t>(Seg.Size)},
- toSysMemoryProtectionFlags(Seg.RAG.Prot)))
- return BailOut(errorCodeToError(EC));
+ MB, toSysMemoryProtectionFlags(Seg.RAG.Prot)))
+ return errorCodeToError(EC);
+
+ MBsToReset.push_back(MB);
+
if ((Seg.RAG.Prot & MemProt::Exec) == MemProt::Exec)
sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
}
- // Run finalization actions.
- for (auto &ActPair : FR.Actions) {
- if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged())
- return BailOut(std::move(Err));
- ++SuccessfulFinalizationActions;
+ auto DeallocActions = runFinalizeActions(FR.Actions);
+ if (!DeallocActions)
+ return DeallocActions.takeError();
+
+ {
+ std::lock_guard<std::mutex> Lock(M);
+ auto Region = createRegionInfo(RR, "In initialize");
+ if (!Region)
+ return Region.takeError();
+ Region->DeallocActions = std::move(*DeallocActions);
}
- return Error::success();
+ // Successful initialization.
+ ResetMBs.release();
+
+ return RR.Start;
}
-Error SimpleExecutorMemoryManager::deallocate(
- const std::vector<ExecutorAddr> &Bases) {
- std::vector<std::pair<void *, Allocation>> AllocPairs;
- AllocPairs.reserve(Bases.size());
+Error SimpleExecutorMemoryManager::deinitialize(
+ const std::vector<ExecutorAddr> &InitKeys) {
+ Error Err = Error::success();
- // Get allocation to destroy.
+ for (auto &KeyAddr : llvm::reverse(InitKeys)) {
+ std::vector<shared::WrapperFunctionCall> DeallocActions;
+ {
+ std::scoped_lock<std::mutex> Lock(M);
+ auto Slab = getSlabInfo(KeyAddr, "In deinitialize");
+ if (!Slab) {
+ Err = joinErrors(std::move(Err), Slab.takeError());
+ continue;
+ }
+
+ auto RI = getRegionInfo(*Slab, KeyAddr, "In deinitialize");
+ if (!RI) {
+ Err = joinErrors(std::move(Err), RI.takeError());
+ continue;
+ }
+
+ DeallocActions = std::move(RI->DeallocActions);
+ }
+
+ Err = joinErrors(std::move(Err),
+ runDeallocActions(std::move(DeallocActions)));
+ }
+
+ return Err;
+}
+
+Error SimpleExecutorMemoryManager::release(
+ const std::vector<ExecutorAddr> &Bases) {
Error Err = Error::success();
- {
- std::lock_guard<std::mutex> Lock(M);
- for (auto &Base : Bases) {
- auto I = Allocations.find(Base.toPtr<void *>());
-
- // Check for missing allocation (effective a double free).
- if (I != Allocations.end()) {
- AllocPairs.push_back(std::move(*I));
- Allocations.erase(I);
- } else
+
+ // TODO: Prohibit new initializations within the slabs being removed?
+ for (auto &Base : llvm::reverse(Bases)) {
+ std::vector<shared::WrapperFunctionCall> DeallocActions;
+ sys::MemoryBlock MB;
+
+ {
+ std::scoped_lock<std::mutex> Lock(M);
+
+ auto SlabI = Slabs.find(Base.toPtr<void *>());
+ if (SlabI == Slabs.end()) {
Err = joinErrors(
std::move(Err),
- make_error<StringError>("No allocation entry found "
- "for " +
- formatv("{0:x}", Base.getValue()),
+ make_error<StringError>("In release, " + formatv("{0:x}", Base) +
+ " is not part of any reserved "
+ "address range",
inconvertibleErrorCode()));
+ continue;
+ }
+
+ auto &Slab = SlabI->second;
+
+ for (auto &[Addr, Region] : Slab.Regions)
+ llvm::copy(Region.DeallocActions, back_inserter(DeallocActions));
+
+ MB = {Base.toPtr<void *>(), Slab.Size};
+
+ Slabs.erase(SlabI);
}
- }
- while (!AllocPairs.empty()) {
- auto &P = AllocPairs.back();
- Err = joinErrors(std::move(Err), deallocateImpl(P.first, P.second));
- AllocPairs.pop_back();
+ Err = joinErrors(std::move(Err), runDeallocActions(DeallocActions));
+ if (auto EC = sys::Memory::releaseMappedMemory(MB))
+ Err = joinErrors(std::move(Err), errorCodeToError(EC));
}
return Err;
@@ -185,16 +187,15 @@ Error SimpleExecutorMemoryManager::deallocate(
Error SimpleExecutorMemoryManager::shutdown() {
- AllocationsMap AM;
+ // TODO: Prevent new allocations during shutdown.
+ std::vector<ExecutorAddr> Bases;
{
- std::lock_guard<std::mutex> Lock(M);
- AM = std::move(Allocations);
+ std::scoped_lock<std::mutex> Lock(M);
+ for (auto &[Base, Slab] : Slabs)
+ Bases.push_back(ExecutorAddr::fromPtr(Base));
}
- Error Err = Error::success();
- for (auto &KV : AM)
- Err = joinErrors(std::move(Err), deallocateImpl(KV.first, KV.second));
- return Err;
+ return release(Bases);
}
void SimpleExecutorMemoryManager::addBootstrapSymbols(
@@ -202,58 +203,150 @@ void SimpleExecutorMemoryManager::addBootstrapSymbols(
M[rt::SimpleExecutorMemoryManagerInstanceName] = ExecutorAddr::fromPtr(this);
M[rt::SimpleExecutorMemoryManagerReserveWrapperName] =
ExecutorAddr::fromPtr(&reserveWrapper);
- M[rt::SimpleExecutorMemoryManagerFinalizeWrapperName] =
- ExecutorAddr::fromPtr(&finalizeWrapper);
- M[rt::SimpleExecutorMemoryManagerDeallocateWrapperName] =
- ExecutorAddr::fromPtr(&deallocateWrapper);
+ M[rt::SimpleExecutorMemoryManagerInitializeWrapperName] =
+ ExecutorAddr::fromPtr(&initializeWrapper);
+ M[rt::SimpleExecutorMemoryManagerDeinitializeWrapperName] =
+ ExecutorAddr::fromPtr(&deinitializeWrapper);
+ M[rt::SimpleExecutorMemoryManagerReleaseWrapperName] =
+ ExecutorAddr::fromPtr(&releaseWrapper);
}
-Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) {
- Error Err = Error::success();
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddr A, StringRef Context) {
+ auto MakeBadSlabError = [&]() {
+ return make_error<StringError>(
+ Context + ", address " + formatv("{0:x}", A) +
+ " is not part of any reserved address range",
+ inconvertibleErrorCode());
+ };
- while (!A.DeallocationActions.empty()) {
- Err = joinErrors(std::move(Err),
- A.DeallocationActions.back().runWithSPSRetErrorMerged());
- A.DeallocationActions.pop_back();
+ auto I = Slabs.upper_bound(A.toPtr<void *>());
+ if (I == Slabs.begin())
+ return MakeBadSlabError();
+ --I;
+ if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+ .contains(A))
+ return MakeBadSlabError();
+
+ return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::SlabInfo &>
+SimpleExecutorMemoryManager::getSlabInfo(ExecutorAddrRange R,
+ StringRef Context) {
+ auto MakeBadSlabError = [&]() {
+ return make_error<StringError>(
+ Context + ", range " + formatv("{0:x}", R) +
+ " is not part of any reserved address range",
+ inconvertibleErrorCode());
+ };
+
+ auto I = Slabs.upper_bound(R.Start.toPtr<void *>());
+ if (I == Slabs.begin())
+ return MakeBadSlabError();
+ --I;
+ if (!ExecutorAddrRange(ExecutorAddr::fromPtr(I->first), I->second.Size)
+ .contains(R))
+ return MakeBadSlabError();
+
+ return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::createRegionInfo(ExecutorAddrRange R,
+ StringRef Context) {
+
+ auto Slab = getSlabInfo(R, Context);
+ if (!Slab)
+ return Slab.takeError();
+
+ auto MakeBadRegionError = [&](ExecutorAddrRange Other, bool Prev) {
+ return make_error<StringError>(Context + ", region " + formatv("{0:x}", R) +
+ " overlaps " +
+ (Prev ? "previous" : "following") +
+ " region " + formatv("{0:x}", Other),
+ inconvertibleErrorCode());
+ };
+
+ auto I = Slab->Regions.upper_bound(R.Start);
+ if (I != Slab->Regions.begin()) {
+ auto J = std::prev(I);
+ ExecutorAddrRange PrevRange(J->first, J->second.Size);
+ if (PrevRange.overlaps(R))
+ return MakeBadRegionError(PrevRange, true);
+ }
+ if (I != Slab->Regions.end()) {
+ ExecutorAddrRange NextRange(I->first, I->second.Size);
+ if (NextRange.overlaps(R))
+ return MakeBadRegionError(NextRange, false);
}
- sys::MemoryBlock MB(Base, A.Size);
- if (auto EC = sys::Memory::releaseMappedMemory(MB))
- Err = joinErrors(std::move(Err), errorCodeToError(EC));
+ auto &RInfo = Slab->Regions[R.Start];
+ RInfo.Size = R.size();
+ return RInfo;
+}
- return Err;
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(SlabInfo &Slab, ExecutorAddr A,
+ StringRef Context) {
+ auto I = Slab.Regions.find(A);
+ if (I == Slab.Regions.end())
+ return make_error<StringError>(
+ Context + ", address " + formatv("{0:x}", A) +
+ " does not correspond to the start of any initialized region",
+ inconvertibleErrorCode());
+
+ return I->second;
+}
+
+Expected<SimpleExecutorMemoryManager::RegionInfo &>
+SimpleExecutorMemoryManager::getRegionInfo(ExecutorAddr A, StringRef Context) {
+ auto Slab = getSlabInfo(A, Context);
+ if (!Slab)
+ return Slab.takeError();
+
+ return getRegionInfo(*Slab, A, Context);
}
llvm::orc::shared::CWrapperFunctionResult
SimpleExecutorMemoryManager::reserveWrapper(const char *ArgData,
size_t ArgSize) {
- return shared::WrapperFunction<
- rt::SPSSimpleExecutorMemoryManagerReserveSignature>::
+ return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReserveSignature>::
handle(ArgData, ArgSize,
shared::makeMethodWrapperHandler(
- &SimpleExecutorMemoryManager::allocate))
+ &SimpleExecutorMemoryManager::reserve))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+SimpleExecutorMemoryManager::initializeWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::
+ WrapperFunction<rt::SPSSimpleRemoteMemoryMapInitializeSignature>::handle(
+ ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &SimpleExecutorMemoryManager::initialize))
.release();
}
llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::finalizeWrapper(const char *ArgData,
- size_t ArgSize) {
+SimpleExecutorMemoryManager::deinitializeWrapper(const char *ArgData,
+ size_t ArgSize) {
return shared::WrapperFunction<
- rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+ rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>::
handle(ArgData, ArgSize,
shared::makeMethodWrapperHandler(
- &SimpleExecutorMemoryManager::finalize))
+ &SimpleExecutorMemoryManager::deinitialize))
.release();
}
llvm::orc::shared::CWrapperFunctionResult
-SimpleExecutorMemoryManager::deallocateWrapper(const char *ArgData,
- size_t ArgSize) {
- return shared::WrapperFunction<
- rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+SimpleExecutorMemoryManager::releaseWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<rt::SPSSimpleRemoteMemoryMapReleaseSignature>::
handle(ArgData, ArgSize,
shared::makeMethodWrapperHandler(
- &SimpleExecutorMemoryManager::deallocate))
+ &SimpleExecutorMemoryManager::release))
.release();
}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 2430d98..3908a78 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -2374,16 +2374,21 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
Out << "!DICompileUnit(";
MDFieldPrinter Printer(Out, WriterCtx);
- auto Lang = N->getSourceLanguage();
- if (Lang.hasVersionedName())
+ DISourceLanguageName Lang = N->getSourceLanguage();
+
+ if (Lang.hasVersionedName()) {
Printer.printDwarfEnum(
"sourceLanguageName",
static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()),
dwarf::SourceLanguageNameString,
/* ShouldSkipZero */ false);
- else
+
+ Printer.printInt("sourceLanguageVersion", Lang.getVersion(),
+ /*ShouldSkipZero=*/true);
+ } else {
Printer.printDwarfEnum("language", Lang.getName(), dwarf::LanguageString,
/* ShouldSkipZero */ false);
+ }
Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
Printer.printString("producer", N->getProducer());
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index f28b989..d8374b6 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6041,8 +6041,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
Triple T(TT);
// The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting
// the address space of globals to 1. This does not apply to SPIRV Logical.
- if (((T.isAMDGPU() && !T.isAMDGCN()) ||
- (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) &&
+ if ((T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical())) &&
!DL.contains("-G") && !DL.starts_with("G")) {
return DL.empty() ? std::string("G1") : (DL + "-G1").str();
}
@@ -6055,35 +6054,43 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
return DL.str();
}
+ // AMDGPU data layout upgrades.
std::string Res = DL.str();
- // AMDGCN data layout upgrades.
- if (T.isAMDGCN()) {
+ if (T.isAMDGPU()) {
// Define address spaces for constants.
if (!DL.contains("-G") && !DL.starts_with("G"))
Res.append(Res.empty() ? "G1" : "-G1");
- // Add missing non-integral declarations.
- // This goes before adding new address spaces to prevent incoherent string
- // values.
- if (!DL.contains("-ni") && !DL.starts_with("ni"))
- Res.append("-ni:7:8:9");
- // Update ni:7 to ni:7:8:9.
- if (DL.ends_with("ni:7"))
- Res.append(":8:9");
- if (DL.ends_with("ni:7:8"))
- Res.append(":9");
-
- // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
- // resources) An empty data layout has already been upgraded to G1 by now.
- if (!DL.contains("-p7") && !DL.starts_with("p7"))
- Res.append("-p7:160:256:256:32");
- if (!DL.contains("-p8") && !DL.starts_with("p8"))
- Res.append("-p8:128:128:128:48");
- constexpr StringRef OldP8("-p8:128:128-");
- if (DL.contains(OldP8))
- Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
- if (!DL.contains("-p9") && !DL.starts_with("p9"))
- Res.append("-p9:192:256:256:32");
+ // AMDGCN data layout upgrades.
+ if (T.isAMDGCN()) {
+
+ // Add missing non-integral declarations.
+ // This goes before adding new address spaces to prevent incoherent string
+ // values.
+ if (!DL.contains("-ni") && !DL.starts_with("ni"))
+ Res.append("-ni:7:8:9");
+ // Update ni:7 to ni:7:8:9.
+ if (DL.ends_with("ni:7"))
+ Res.append(":8:9");
+ if (DL.ends_with("ni:7:8"))
+ Res.append(":9");
+
+ // Add sizing for address spaces 7 and 8 (fat raw buffers and buffer
+ // resources) An empty data layout has already been upgraded to G1 by now.
+ if (!DL.contains("-p7") && !DL.starts_with("p7"))
+ Res.append("-p7:160:256:256:32");
+ if (!DL.contains("-p8") && !DL.starts_with("p8"))
+ Res.append("-p8:128:128:128:48");
+ constexpr StringRef OldP8("-p8:128:128-");
+ if (DL.contains(OldP8))
+ Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
+ if (!DL.contains("-p9") && !DL.starts_with("p9"))
+ Res.append("-p9:192:256:256:32");
+ }
+
+ // Upgrade the ELF mangling mode.
+ if (!DL.contains("m:e"))
+ Res = Res.empty() ? "m:e" : "m:e-" + Res;
return Res;
}
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index 51d2e21..5b87686 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -506,3 +507,168 @@ ConstantFPRange ConstantFPRange::sub(const ConstantFPRange &Other) const {
// fsub X, Y = fadd X, (fneg Y)
return add(Other.negate());
}
+
+void ConstantFPRange::flushDenormals(DenormalMode::DenormalModeKind Mode) {
+ if (Mode == DenormalMode::IEEE)
+ return;
+ FPClassTest Class = classify();
+ if (!(Class & fcSubnormal))
+ return;
+
+ auto &Sem = getSemantics();
+ // PreserveSign: PosSubnormal -> PosZero, NegSubnormal -> NegZero
+ // PositiveZero: PosSubnormal -> PosZero, NegSubnormal -> PosZero
+ // Dynamic: PosSubnormal -> PosZero, NegSubnormal -> NegZero/PosZero
+ bool ZeroLowerNegative =
+ Mode != DenormalMode::PositiveZero && (Class & fcNegSubnormal);
+ bool ZeroUpperNegative =
+ Mode == DenormalMode::PreserveSign && !(Class & fcPosSubnormal);
+ assert((ZeroLowerNegative || !ZeroUpperNegative) &&
+ "ZeroLower is greater than ZeroUpper.");
+ Lower = minnum(Lower, APFloat::getZero(Sem, ZeroLowerNegative));
+ Upper = maxnum(Upper, APFloat::getZero(Sem, ZeroUpperNegative));
+}
+
+/// Represent a contiguous range of values sharing the same sign.
+struct SameSignRange {
+ bool HasZero;
+ bool HasNonZero;
+ bool HasInf;
+ // The lower and upper bounds of the range (inclusive).
+ // The sign is dropped and infinities are excluded.
+ std::optional<std::pair<APFloat, APFloat>> FinitePart;
+
+ explicit SameSignRange(const APFloat &Lower, const APFloat &Upper)
+ : HasZero(Lower.isZero()), HasNonZero(!Upper.isZero()),
+ HasInf(Upper.isInfinity()) {
+ assert(!Lower.isNegative() && !Upper.isNegative() &&
+ "The sign should be dropped.");
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Empty set.");
+ if (!Lower.isInfinity())
+ FinitePart = {Lower,
+ HasInf ? APFloat::getLargest(Lower.getSemantics()) : Upper};
+ }
+};
+
+/// Split the range into positive and negative components.
+static void splitPosNeg(const APFloat &Lower, const APFloat &Upper,
+ std::optional<SameSignRange> &NegPart,
+ std::optional<SameSignRange> &PosPart) {
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Non-NaN part is empty.");
+ if (Lower.isNegative() == Upper.isNegative()) {
+ if (Lower.isNegative())
+ NegPart = SameSignRange{abs(Upper), abs(Lower)};
+ else
+ PosPart = SameSignRange{Lower, Upper};
+ return;
+ }
+ auto &Sem = Lower.getSemantics();
+ NegPart = SameSignRange{APFloat::getZero(Sem), abs(Lower)};
+ PosPart = SameSignRange{APFloat::getZero(Sem), Upper};
+}
+
+ConstantFPRange ConstantFPRange::mul(const ConstantFPRange &Other) const {
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {
+ if (!LHS || !RHS)
+ return;
+ // 0 * inf = QNaN
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasInf;
+ ResMayBeQNaN |= RHS->HasZero && LHS->HasInf;
+ // NonZero * inf = inf
+ if ((LHS->HasInf && RHS->HasNonZero) || (RHS->HasInf && LHS->HasNonZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite * Finite
+ if (LHS->FinitePart && RHS->FinitePart) {
+ APFloat NewLower = LHS->FinitePart->first * RHS->FinitePart->first;
+ APFloat NewUpper = LHS->FinitePart->second * RHS->FinitePart->second;
+ if (Negative) {
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
+
+ConstantFPRange ConstantFPRange::div(const ConstantFPRange &Other) const {
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {
+ if (!LHS || !RHS)
+ return;
+ // inf / inf = QNaN 0 / 0 = QNaN
+ ResMayBeQNaN |= LHS->HasInf && RHS->HasInf;
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasZero;
+ // It is not straightforward to infer HasNonZeroFinite = HasFinite &&
+ // HasNonZero. By definitions we have:
+ // HasFinite = HasNonZeroFinite || HasZero
+ // HasNonZero = HasNonZeroFinite || HasInf
+ // Since the range is contiguous, if both HasFinite and HasNonZero are true,
+ // HasNonZeroFinite must be true.
+ bool LHSHasNonZeroFinite = LHS->FinitePart && LHS->HasNonZero;
+ bool RHSHasNonZeroFinite = RHS->FinitePart && RHS->HasNonZero;
+ // inf / Finite = inf FiniteNonZero / 0 = inf
+ if ((LHS->HasInf && RHS->FinitePart) ||
+ (LHSHasNonZeroFinite && RHS->HasZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite / inf = 0
+ if (LHS->FinitePart && RHS->HasInf) {
+ APFloat Zero = APFloat::getZero(Sem, /*Negative=*/Negative);
+ ResLower = minnum(ResLower, Zero);
+ ResUpper = maxnum(ResUpper, Zero);
+ }
+ // Finite / FiniteNonZero
+ if (LHS->FinitePart && RHSHasNonZeroFinite) {
+ assert(!RHS->FinitePart->second.isZero() &&
+ "Divisor should be non-zero.");
+ APFloat NewLower = LHS->FinitePart->first / RHS->FinitePart->second;
+ APFloat NewUpper = LHS->FinitePart->second /
+ (RHS->FinitePart->first.isZero()
+ ? APFloat::getSmallest(Sem, /*Negative=*/false)
+ : RHS->FinitePart->first);
+ if (Negative) {
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);
+}
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 3842b1a..6a9ef2e 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -741,7 +741,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
assert(!CI2->isZero() && "And zero handled above");
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
// If and'ing the address of a global with a constant, fold it.
- if (CE1->getOpcode() == Instruction::PtrToInt &&
+ if ((CE1->getOpcode() == Instruction::PtrToInt ||
+ CE1->getOpcode() == Instruction::PtrToAddr) &&
isa<GlobalValue>(CE1->getOperand(0))) {
GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 2c2950c..cbce8bd 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -667,8 +667,11 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
if (CE->getOpcode() == Instruction::Sub) {
ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0));
ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1));
- if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt &&
- RHS->getOpcode() == Instruction::PtrToInt) {
+ if (LHS && RHS &&
+ (LHS->getOpcode() == Instruction::PtrToInt ||
+ LHS->getOpcode() == Instruction::PtrToAddr) &&
+ (RHS->getOpcode() == Instruction::PtrToInt ||
+ RHS->getOpcode() == Instruction::PtrToAddr)) {
Constant *LHSOp0 = LHS->getOperand(0);
Constant *RHSOp0 = RHS->getOperand(0);
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 3f1cc1e..27d8294 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -4098,15 +4098,8 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str,
return wrap(unwrap(B)->CreateGlobalString(Str, Name));
}
-LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) {
- Value *P = unwrap(MemAccessInst);
- if (LoadInst *LI = dyn_cast<LoadInst>(P))
- return LI->isVolatile();
- if (StoreInst *SI = dyn_cast<StoreInst>(P))
- return SI->isVolatile();
- if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(P))
- return AI->isVolatile();
- return cast<AtomicCmpXchgInst>(P)->isVolatile();
+LLVMBool LLVMGetVolatile(LLVMValueRef Inst) {
+ return cast<Instruction>(unwrap(Inst))->isVolatile();
}
void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 9601a8a..5883606 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -294,9 +294,9 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
// just DISubprogram's, referenced from anywhere within the Function being
// cloned prior to calling MapMetadata / RemapInstruction to avoid their
// duplication later as DICompileUnit's are also directly referenced by
- // llvm.dbg.cu list. Thefore we need to collect DICompileUnit's here as well.
- // Also, DICompileUnit's may reference DISubprogram's too and therefore need
- // to be at least looked through.
+ // llvm.dbg.cu list. Therefore we need to collect DICompileUnit's here as
+ // well. Also, DICompileUnit's may reference DISubprogram's too and therefore
+ // need to be at least looked through.
processCompileUnit(SP->getUnit());
processType(SP->getType());
for (auto *Element : SP->getTemplateParams()) {
@@ -377,7 +377,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
/// Recursively handle DILocations in followup metadata etc.
///
-/// TODO: If for example a followup loop metadata would refence itself this
+/// TODO: If for example a followup loop metadata would reference itself this
/// function would go into infinite recursion. We do not expect such cycles in
/// the loop metadata (except for the self-referencing first element
/// "LoopID"). However, we could at least handle such situations more gracefully
@@ -679,7 +679,7 @@ private:
auto Variables = nullptr;
auto TemplateParams = nullptr;
- // Make a distinct DISubprogram, for situations that warrent it.
+ // Make a distinct DISubprogram, for situations that warrant it.
auto distinctMDSubprogram = [&]() {
return DISubprogram::getDistinct(
MDS->getContext(), FileAndScope, MDS->getName(), LinkageName,
@@ -1095,6 +1095,35 @@ LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename,
StringRef(Directory, DirectoryLen)));
}
+static llvm::DIFile::ChecksumKind
+map_from_llvmChecksumKind(LLVMChecksumKind CSKind) {
+ switch (CSKind) {
+ case LLVMChecksumKind::CSK_MD5:
+ return llvm::DIFile::CSK_MD5;
+ case LLVMChecksumKind::CSK_SHA1:
+ return llvm::DIFile::CSK_SHA1;
+ case LLVMChecksumKind::CSK_SHA256:
+ return llvm::DIFile::CSK_SHA256;
+ }
+ llvm_unreachable("Unhandled Checksum Kind");
+}
+
+LLVMMetadataRef LLVMDIBuilderCreateFileWithChecksum(
+ LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen,
+ const char *Directory, size_t DirectoryLen, LLVMChecksumKind ChecksumKind,
+ const char *Checksum, size_t ChecksumLen, const char *Source,
+ size_t SourceLen) {
+ StringRef ChkSum = StringRef(Checksum, ChecksumLen);
+ auto CSK = map_from_llvmChecksumKind(ChecksumKind);
+ llvm::DIFile::ChecksumInfo<StringRef> CSInfo(CSK, ChkSum);
+ std::optional<StringRef> Src;
+ if (SourceLen > 0)
+ Src = StringRef(Source, SourceLen);
+ return wrap(unwrap(Builder)->createFile(StringRef(Filename, FilenameLen),
+ StringRef(Directory, DirectoryLen),
+ CSInfo, Src));
+}
+
LLVMMetadataRef
LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope,
const char *Name, size_t NameLen,
@@ -2014,7 +2043,7 @@ void at::remapAssignID(DenseMap<DIAssignID *, DIAssignID *> &Map,
I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
}
-/// Collect constant properies (base, size, offset) of \p StoreDest.
+/// Collect constant properties (base, size, offset) of \p StoreDest.
/// Return std::nullopt if any properties are not constants or the
/// offset from the base pointer is negative.
static std::optional<AssignmentInfo>
@@ -2300,7 +2329,7 @@ PreservedAnalyses AssignmentTrackingPass::run(Function &F,
return PreservedAnalyses::all();
// Record that this module uses assignment tracking. It doesn't matter that
- // some functons in the module may not use it - the debug info in those
+ // some functions in the module may not use it - the debug info in those
// functions will still be handled properly.
setAssignmentTrackingModuleFlag(*F.getParent());
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 614c3a9..15c0198 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -1002,6 +1003,18 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
return C;
}
+Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name) {
+ Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
+ if (auto *SI = dyn_cast<SelectInst>(Ret)) {
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *SI, *SI->getParent()->getParent(), PassName);
+ }
+ return Ret;
+}
+
Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name, Instruction *MDFrom) {
return CreateSelectFMF(C, True, False, {}, Name, MDFrom);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 88e7c44..9060a89 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2965,8 +2965,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
// zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
case 11: {
- // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize/AddrSize
- // and SrcSize==DstSize
+ // inttoptr, ptrtoint/ptrtoaddr -> integer cast
if (!DL)
return 0;
unsigned MidSize = secondOp == Instruction::PtrToAddr
@@ -2974,10 +2973,15 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
: DL->getPointerTypeSizeInBits(MidTy);
unsigned SrcSize = SrcTy->getScalarSizeInBits();
unsigned DstSize = DstTy->getScalarSizeInBits();
- // TODO: Could also produce zext or trunc here.
- if (SrcSize <= MidSize && SrcSize == DstSize)
- return Instruction::BitCast;
- return 0;
+ // If the middle size is smaller than both source and destination,
+ // an additional masking operation would be required.
+ if (MidSize < SrcSize && MidSize < DstSize)
+ return 0;
+ if (DstSize < SrcSize)
+ return Instruction::Trunc;
+ if (DstSize > SrcSize)
+ return Instruction::ZExt;
+ return Instruction::BitCast;
}
case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 9db48e8..0e9535d 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -1034,6 +1034,10 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
}
// DirectX resources
+ if (Name == "dx.Padding")
+ return TargetTypeInfo(
+ ArrayType::get(Type::getInt8Ty(C), Ty->getIntParameter(0)),
+ TargetExtType::CanBeGlobal);
if (Name.starts_with("dx."))
return TargetTypeInfo(PointerType::get(C, 0), TargetExtType::CanBeGlobal,
TargetExtType::CanBeLocal,
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c9ff86b..3572852 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -893,7 +893,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (GV.hasInitializer()) {
const Constant *Init = GV.getInitializer();
const ConstantArray *InitArray = dyn_cast<ConstantArray>(Init);
- Check(InitArray, "wrong initalizer for intrinsic global variable",
+ Check(InitArray, "wrong initializer for intrinsic global variable",
Init);
for (Value *Op : InitArray->operands()) {
Value *V = Op->stripPointerCasts();
@@ -6479,9 +6479,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
NumRows->getZExtValue() * NumColumns->getZExtValue(),
"Result of a matrix operation does not fit in the returned vector!");
- if (Stride)
+ if (Stride) {
+ Check(Stride->getBitWidth() <= 64, "Stride bitwidth cannot exceed 64!",
+ IF);
Check(Stride->getZExtValue() >= NumRows->getZExtValue(),
"Stride must be greater or equal than the number of rows!", IF);
+ }
break;
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e6544f3..aec8891 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1257,38 +1257,6 @@ Error LTO::run(AddStreamFn AddStream, FileCache Cache) {
return Result;
}
-void lto::updateMemProfAttributes(Module &Mod,
- const ModuleSummaryIndex &Index) {
- llvm::TimeTraceScope timeScope("LTO update memprof attributes");
- if (Index.withSupportsHotColdNew())
- return;
-
- // The profile matcher applies hotness attributes directly for allocations,
- // and those will cause us to generate calls to the hot/cold interfaces
- // unconditionally. If supports-hot-cold-new was not enabled in the LTO
- // link then assume we don't want these calls (e.g. not linking with
- // the appropriate library, or otherwise trying to disable this behavior).
- for (auto &F : Mod) {
- for (auto &BB : F) {
- for (auto &I : BB) {
- auto *CI = dyn_cast<CallBase>(&I);
- if (!CI)
- continue;
- if (CI->hasFnAttr("memprof"))
- CI->removeFnAttr("memprof");
- // Strip off all memprof metadata as it is no longer needed.
- // Importantly, this avoids the addition of new memprof attributes
- // after inlining propagation.
- // TODO: If we support additional types of MemProf metadata beyond hot
- // and cold, we will need to update the metadata based on the allocator
- // APIs supported instead of completely stripping all.
- CI->setMetadata(LLVMContext::MD_memprof, nullptr);
- CI->setMetadata(LLVMContext::MD_callsite, nullptr);
- }
- }
- }
-}
-
Error LTO::runRegularLTO(AddStreamFn AddStream) {
llvm::TimeTraceScope timeScope("Run regular LTO");
LLVMContext &CombinedCtx = RegularLTO.CombinedModule->getContext();
@@ -1346,8 +1314,6 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
}
}
- updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex);
-
bool WholeProgramVisibilityEnabledInLTO =
Conf.HasWholeProgramVisibility &&
// If validation is enabled, upgrade visibility only when all vtables
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 11a7b32..280c3d1 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -726,7 +726,6 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
}
// Do this after any importing so that imported code is updated.
- updateMemProfAttributes(Mod, CombinedIndex);
updatePublicTypeTestCalls(Mod, CombinedIndex.withWholeProgramVisibility());
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index a755c22..aee3c3b 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -553,7 +553,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
SFrameSection =
Ctx->getELFSection(".sframe", ELF::SHT_GNU_SFRAME, ELF::SHF_ALLOC);
- CallGraphSection = Ctx->getELFSection(".callgraph", ELF::SHT_PROGBITS, 0);
+ CallGraphSection =
+ Ctx->getELFSection(".llvm.callgraph", ELF::SHT_PROGBITS, 0);
StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0);
@@ -1171,8 +1172,8 @@ MCObjectFileInfo::getCallGraphSection(const MCSection &TextSec) const {
}
return Ctx->getELFSection(
- ".callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName, true,
- ElfSec.getUniqueID(),
+ ".llvm.callgraph", ELF::SHT_PROGBITS, Flags, 0, GroupName,
+ /*IsComdat=*/true, ElfSec.getUniqueID(),
static_cast<const MCSymbolELF *>(TextSec.getBeginSymbol()));
}
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index eef8a21..6b7b4f1 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -122,14 +122,14 @@ ConfigManager::getDXContainerConfig() const {
if (!Common.AddGnuDebugLink.empty() || !Common.SplitDWO.empty() ||
!Common.AllocSectionsPrefix.empty() ||
Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() ||
- !Common.DumpSection.empty() || !Common.KeepSection.empty() ||
- !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
- !Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() ||
- Common.ExtractDWO || Common.OnlyKeepDebug || Common.StripAllGNU ||
- Common.StripDWO || Common.StripDebug || Common.StripNonAlloc ||
- Common.StripSections || Common.StripUnneeded ||
- Common.DecompressDebugSections || Common.GapFill != 0 ||
- Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+ !Common.KeepSection.empty() || !Common.SectionsToRename.empty() ||
+ !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() ||
+ !Common.SetSectionType.empty() || Common.ExtractDWO ||
+ Common.OnlyKeepDebug || Common.StripAllGNU || Common.StripDWO ||
+ Common.StripDebug || Common.StripNonAlloc || Common.StripSections ||
+ Common.StripUnneeded || Common.DecompressDebugSections ||
+ Common.GapFill != 0 || Common.PadTo != 0 ||
+ Common.ChangeSectionLMAValAll != 0 ||
!Common.ChangeSectionAddress.empty()) {
return createStringError(llvm::errc::invalid_argument,
"option is not supported for DXContainer");
diff --git a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
index d7f3c0d..95ab3d9 100644
--- a/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
+++ b/llvm/lib/ObjCopy/DXContainer/DXContainerObjcopy.cpp
@@ -9,8 +9,10 @@
#include "llvm/ObjCopy/DXContainer/DXContainerObjcopy.h"
#include "DXContainerReader.h"
#include "DXContainerWriter.h"
+#include "llvm/BinaryFormat/DXContainer.h"
#include "llvm/ObjCopy/CommonConfig.h"
#include "llvm/ObjCopy/DXContainer/DXContainerConfig.h"
+#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
@@ -42,7 +44,47 @@ static Error extractPartAsObject(StringRef PartName, StringRef OutFilename,
"part '%s' not found", PartName.str().c_str());
}
+static Error dumpPartToFile(StringRef PartName, StringRef Filename,
+ StringRef InputFilename, Object &Obj) {
+ auto PartIter = llvm::find_if(
+ Obj.Parts, [&PartName](const Part &P) { return P.Name == PartName; });
+ if (PartIter == Obj.Parts.end())
+ return createFileError(Filename,
+ std::make_error_code(std::errc::invalid_argument),
+ "part '%s' not found", PartName.str().c_str());
+ ArrayRef<uint8_t> Contents = PartIter->Data;
+ // The DXContainer format is a bit odd because the part-specific headers are
+ // contained inside the part data itself. For parts that contain LLVM bitcode
+ // when we dump the part we want to skip the part-specific header so that we
+ // get a valid .bc file that we can inspect. All the data contained inside the
+ // program header is pulled out of the bitcode, so the header can be
+ // reconstructed if needed from the bitcode itself. More comprehensive
+ // documentation on the DXContainer format can be found at
+ // https://llvm.org/docs/DirectX/DXContainer.html.
+
+ if (PartName == "DXIL" || PartName == "STAT")
+ Contents = Contents.drop_front(sizeof(llvm::dxbc::ProgramHeader));
+ if (Contents.empty())
+ return createFileError(Filename, object_error::parse_failed,
+ "part '%s' is empty", PartName.str().c_str());
+ Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+ FileOutputBuffer::create(Filename, Contents.size());
+ if (!BufferOrErr)
+ return createFileError(Filename, BufferOrErr.takeError());
+ std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
+ llvm::copy(Contents, Buf->getBufferStart());
+ if (Error E = Buf->commit())
+ return createFileError(Filename, std::move(E));
+ return Error::success();
+}
+
static Error handleArgs(const CommonConfig &Config, Object &Obj) {
+ for (StringRef Flag : Config.DumpSection) {
+ auto [SecName, FileName] = Flag.split("=");
+ if (Error E = dumpPartToFile(SecName, FileName, Config.InputFilename, Obj))
+ return E;
+ }
+
// Extract all sections before any modifications.
for (StringRef Flag : Config.ExtractSection) {
StringRef SectionName;
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index 53699ce0..f256e7b 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -837,7 +837,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
Version = Data.getU8(Cur);
if (!Cur)
break;
- if (Version < 2 || Version > 3)
+ if (Version < 2 || Version > 4)
return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " +
Twine(static_cast<int>(Version)));
Feature = Data.getU8(Cur); // Feature byte
@@ -852,6 +852,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
"callsite offsets feature is enabled: version = " +
Twine(static_cast<int>(Version)) +
" feature = " + Twine(static_cast<int>(Feature)));
+ if (FeatEnable.BBHash && Version < 4)
+ return createError("version should be >= 4 for SHT_LLVM_BB_ADDR_MAP when "
+ "basic block hash feature is enabled: version = " +
+ Twine(static_cast<int>(Version)) +
+ " feature = " + Twine(static_cast<int>(Feature)));
uint32_t NumBlocksInBBRange = 0;
uint32_t NumBBRanges = 1;
typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0;
@@ -907,6 +912,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
uint32_t Size = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) +
LastCallsiteEndOffset;
uint32_t MD = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);
+ uint64_t Hash = FeatEnable.BBHash ? Data.getU64(Cur) : 0;
Expected<BBAddrMap::BBEntry::Metadata> MetadataOrErr =
BBAddrMap::BBEntry::Metadata::decode(MD);
if (!MetadataOrErr) {
@@ -914,7 +920,7 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,
break;
}
BBEntries.push_back({ID, Offset + PrevBBEndOffset, Size,
- *MetadataOrErr, CallsiteEndOffsets});
+ *MetadataOrErr, CallsiteEndOffsets, Hash});
PrevBBEndOffset += Offset + Size;
}
TotalNumBlocks += BBEntries.size();
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index faeeab3..8b75fbe 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -1465,7 +1465,7 @@ void ELFState<ELFT>::writeSectionContent(
for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) {
// Write version and feature values.
if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) {
- if (E.Version > 3)
+ if (E.Version > 4)
WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: "
<< static_cast<int>(E.Version)
<< "; encoding using the most recent version";
@@ -1526,6 +1526,12 @@ void ELFState<ELFT>::writeSectionContent(
}
SHeader.sh_size += CBA.writeULEB128(BBE.Size);
SHeader.sh_size += CBA.writeULEB128(BBE.Metadata);
+ if (FeatureOrErr->BBHash || BBE.Hash.has_value()) {
+ uint64_t Hash =
+ BBE.Hash.has_value() ? BBE.Hash.value() : llvm::yaml::Hex64(0);
+ CBA.write<uint64_t>(Hash, ELFT::Endianness);
+ SHeader.sh_size += 8;
+ }
}
}
if (!PGOAnalyses)
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index d9cce1e..c3a27c9 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -488,6 +488,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V61, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH);
@@ -499,12 +500,21 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_MACH_V71T, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V73, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_MACH_V75, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V77, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V79, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V81, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V83, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V85, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V87, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V89, EF_HEXAGON_MACH);
+ BCaseMask(EF_HEXAGON_MACH_V91, EF_HEXAGON_MACH);
BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V61, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA);
@@ -514,6 +524,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_HEXAGON_ISA_V71, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V73, EF_HEXAGON_ISA);
BCaseMask(EF_HEXAGON_ISA_V75, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V77, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V79, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V81, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V83, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V85, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V87, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V89, EF_HEXAGON_ISA);
+ BCaseMask(EF_HEXAGON_ISA_V91, EF_HEXAGON_ISA);
break;
case ELF::EM_AVR:
BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK);
@@ -1887,6 +1905,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry::BBEntry>::mapping(
IO.mapRequired("Size", E.Size);
IO.mapRequired("Metadata", E.Metadata);
IO.mapOptional("CallsiteEndOffsets", E.CallsiteEndOffsets);
+ IO.mapOptional("Hash", E.Hash);
}
void MappingTraits<ELFYAML::PGOAnalysisMapEntry>::mapping(
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index fea0d25..3f3939eaf 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1658,6 +1658,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager MPM;
+ // Currently this pipeline is only invoked in an LTO pre link pass or when we
+ // are not running LTO. If that changes the below checks may need updating.
+ assert(isLTOPreLink(Phase) || Phase == ThinOrFullLTOPhase::None);
+
+ // If we are invoking this in non-LTO mode, remove any MemProf related
+ // attributes and metadata, as we don't know whether we are linking with
+ // a library containing the necessary interfaces.
+ if (Phase == ThinOrFullLTOPhase::None)
+ MPM.addPass(MemProfRemoveInfo());
+
// Convert @llvm.global.annotations to !annotation metadata.
MPM.addPass(Annotation2MetadataPass());
@@ -1803,6 +1813,12 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
ModulePassManager MPM;
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ImportSummary || !ImportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
if (ImportSummary) {
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
@@ -1874,6 +1890,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);
+ // If we are invoking this without a summary index noting that we are linking
+ // with a library containing the necessary APIs, remove any MemProf related
+ // attributes and metadata.
+ if (!ExportSummary || !ExportSummary->withSupportsHotColdNew())
+ MPM.addPass(MemProfRemoveInfo());
+
// Create a function that performs CFI checks for cross-DSO calls with targets
// in the current module.
MPM.addPass(CrossDSOCFIPass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 1b16525..884d8da 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -113,6 +113,7 @@ MODULE_PASS("pgo-force-function-attrs",
? PGOOpt->ColdOptType
: PGOOptions::ColdFuncOpt::Default))
MODULE_PASS("memprof-context-disambiguation", MemProfContextDisambiguation())
+MODULE_PASS("memprof-remove-attributes", MemProfRemoveInfo())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("mergefunc", MergeFunctionsPass())
MODULE_PASS("metarenamer", MetaRenamerPass())
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 3c8e44a..0208735 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -302,7 +302,7 @@ void ProfOStream::patch(ArrayRef<PatchItem> P) {
std::string getPGOFuncName(StringRef Name, GlobalValue::LinkageTypes Linkage,
StringRef FileName,
- uint64_t Version LLVM_ATTRIBUTE_UNUSED) {
+ [[maybe_unused]] uint64_t Version) {
// Value names may be prefixed with a binary '1' to indicate
// that the backend should not modify the symbols due to any platform
// naming convention. Do not include that '1' in the PGO profile name.
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h
index 4f66c47..914edd8 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.h
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.h
@@ -112,7 +112,7 @@ public:
/// Helper to parse a META_BLOCK for a bitstream remark container.
class BitstreamMetaParserHelper
: public BitstreamBlockParserHelper<BitstreamMetaParserHelper> {
- friend class BitstreamBlockParserHelper;
+ friend class BitstreamBlockParserHelper<BitstreamMetaParserHelper>;
public:
struct ContainerInfo {
@@ -137,7 +137,7 @@ protected:
/// Helper to parse a REMARK_BLOCK for a bitstream remark container.
class BitstreamRemarkParserHelper
: public BitstreamBlockParserHelper<BitstreamRemarkParserHelper> {
- friend class BitstreamBlockParserHelper;
+ friend class BitstreamBlockParserHelper<BitstreamRemarkParserHelper>;
protected:
SmallVector<uint64_t, 5> Record;
diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp
index 6b65720..5ab1def 100644
--- a/llvm/lib/Support/DebugCounter.cpp
+++ b/llvm/lib/Support/DebugCounter.cpp
@@ -136,6 +136,13 @@ struct DebugCounterOwner : DebugCounter {
cl::location(this->ShouldPrintCounter),
cl::init(false),
cl::desc("Print out debug counter info after all counters accumulated")};
+ cl::opt<bool, true> PrintDebugCounterQueries{
+ "print-debug-counter-queries",
+ cl::Hidden,
+ cl::Optional,
+ cl::location(this->ShouldPrintCounterQueries),
+ cl::init(false),
+ cl::desc("Print out each query of an enabled debug counter")};
cl::opt<bool, true> BreakOnLastCount{
"debug-counter-break-on-last",
cl::Hidden,
@@ -221,31 +228,40 @@ void DebugCounter::print(raw_ostream &OS) const {
}
}
+bool DebugCounter::handleCounterIncrement(CounterInfo &Info) {
+ int64_t CurrCount = Info.Count++;
+ uint64_t CurrIdx = Info.CurrChunkIdx;
+
+ if (Info.Chunks.empty())
+ return true;
+ if (CurrIdx >= Info.Chunks.size())
+ return false;
+
+ bool Res = Info.Chunks[CurrIdx].contains(CurrCount);
+ if (BreakOnLast && CurrIdx == (Info.Chunks.size() - 1) &&
+ CurrCount == Info.Chunks[CurrIdx].End) {
+ LLVM_BUILTIN_DEBUGTRAP;
+ }
+ if (CurrCount > Info.Chunks[CurrIdx].End) {
+ Info.CurrChunkIdx++;
+
+ /// Handle consecutive blocks.
+ if (Info.CurrChunkIdx < Info.Chunks.size() &&
+ CurrCount == Info.Chunks[Info.CurrChunkIdx].Begin)
+ return true;
+ }
+ return Res;
+}
+
bool DebugCounter::shouldExecuteImpl(unsigned CounterName) {
auto &Us = instance();
auto Result = Us.Counters.find(CounterName);
if (Result != Us.Counters.end()) {
auto &CounterInfo = Result->second;
- int64_t CurrCount = CounterInfo.Count++;
- uint64_t CurrIdx = CounterInfo.CurrChunkIdx;
-
- if (CounterInfo.Chunks.empty())
- return true;
- if (CurrIdx >= CounterInfo.Chunks.size())
- return false;
-
- bool Res = CounterInfo.Chunks[CurrIdx].contains(CurrCount);
- if (Us.BreakOnLast && CurrIdx == (CounterInfo.Chunks.size() - 1) &&
- CurrCount == CounterInfo.Chunks[CurrIdx].End) {
- LLVM_BUILTIN_DEBUGTRAP;
- }
- if (CurrCount > CounterInfo.Chunks[CurrIdx].End) {
- CounterInfo.CurrChunkIdx++;
-
- /// Handle consecutive blocks.
- if (CounterInfo.CurrChunkIdx < CounterInfo.Chunks.size() &&
- CurrCount == CounterInfo.Chunks[CounterInfo.CurrChunkIdx].Begin)
- return true;
+ bool Res = Us.handleCounterIncrement(CounterInfo);
+ if (Us.ShouldPrintCounterQueries && CounterInfo.IsSet) {
+ dbgs() << "DebugCounter " << Us.RegisteredCounters[CounterName] << "="
+ << (CounterInfo.Count - 1) << (Res ? " execute" : " skip") << "\n";
}
return Res;
}
diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp
index 82b0e6a..eff9947 100644
--- a/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/llvm/lib/Support/PrettyStackTrace.cpp
@@ -141,7 +141,7 @@ extern "C" const char *__crashreporter_info__
asm(".desc ___crashreporter_info__, 0x10");
#endif
-static void setCrashLogMessage(const char *msg) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static void setCrashLogMessage(const char *msg);
static void setCrashLogMessage(const char *msg) {
#ifdef HAVE_CRASHREPORTERCLIENT_H
(void)CRSetCrashLogMessage(msg);
diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp
index a43cf37a..299615a 100644
--- a/llvm/lib/Support/SourceMgr.cpp
+++ b/llvm/lib/Support/SourceMgr.cpp
@@ -24,6 +24,7 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -38,6 +39,22 @@ using namespace llvm;
static const size_t TabStop = 8;
+// Out of line to avoid needing definition of vfs::FileSystem in header.
+SourceMgr::SourceMgr() = default;
+SourceMgr::SourceMgr(IntrusiveRefCntPtr<vfs::FileSystem> FS)
+ : FS(std::move(FS)) {}
+SourceMgr::SourceMgr(SourceMgr &&) = default;
+SourceMgr &SourceMgr::operator=(SourceMgr &&) = default;
+SourceMgr::~SourceMgr() = default;
+
+IntrusiveRefCntPtr<vfs::FileSystem> SourceMgr::getVirtualFileSystem() const {
+ return FS;
+}
+
+void SourceMgr::setVirtualFileSystem(IntrusiveRefCntPtr<vfs::FileSystem> FS) {
+ this->FS = std::move(FS);
+}
+
unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
SMLoc IncludeLoc,
std::string &IncludedFile) {
@@ -52,8 +69,11 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
ErrorOr<std::unique_ptr<MemoryBuffer>>
SourceMgr::OpenIncludeFile(const std::string &Filename,
std::string &IncludedFile) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
- MemoryBuffer::getFile(Filename);
+ auto GetFile = [this](StringRef Path) {
+ return FS ? FS->getBufferForFile(Path) : MemoryBuffer::getFile(Path);
+ };
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr = GetFile(Filename);
SmallString<64> Buffer(Filename);
// If the file didn't exist directly, see if it's in an include path.
@@ -61,7 +81,7 @@ SourceMgr::OpenIncludeFile(const std::string &Filename,
++i) {
Buffer = IncludeDirectories[i];
sys::path::append(Buffer, Filename);
- NewBufOrErr = MemoryBuffer::getFile(Buffer);
+ NewBufOrErr = GetFile(Buffer);
}
if (NewBufOrErr)
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 80fd485..549c418 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -55,12 +55,20 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern,
return Error::success();
}
+void SpecialCaseList::RegexMatcher::preprocess(bool BySize) {
+ if (BySize) {
+ llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) {
+ return A.Name.size() < B.Name.size();
+ });
+ }
+}
+
void SpecialCaseList::RegexMatcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
for (const auto &R : reverse(RegExes))
if (R.Rg.match(Query))
- Cb(R.Name, R.LineNo);
+ return Cb(R.Name, R.LineNo);
}
Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
@@ -75,12 +83,20 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
return Error::success();
}
+void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
+ if (BySize) {
+ llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) {
+ return A.Name.size() < B.Name.size();
+ });
+ }
+}
+
void SpecialCaseList::GlobMatcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
for (const auto &G : reverse(Globs))
if (G.Pattern.match(Query))
- Cb(G.Name, G.LineNo);
+ return Cb(G.Name, G.LineNo);
}
SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
@@ -91,6 +107,14 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
M.emplace<RegexMatcher>();
}
+Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) {
+ return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M);
+}
+
+LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) {
+ return std::visit([&](auto &V) { return V.preprocess(BySize); }, M);
+}
+
void SpecialCaseList::Matcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
@@ -99,10 +123,6 @@ void SpecialCaseList::Matcher::match(
return std::visit([&](auto &V) { return V.match(Query, Cb); }, M);
}
-Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) {
- return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M);
-}
-
// TODO: Refactor this to return Expected<...>
std::unique_ptr<SpecialCaseList>
SpecialCaseList::create(const std::vector<std::string> &Paths,
@@ -141,7 +161,7 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
return false;
}
std::string ParseError;
- if (!parse(i, FileOrErr.get().get(), ParseError)) {
+ if (!parse(i, FileOrErr.get().get(), ParseError, /*OrderBySize=*/false)) {
Error = (Twine("error parsing file '") + Path + "': " + ParseError).str();
return false;
}
@@ -149,9 +169,9 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
return true;
}
-bool SpecialCaseList::createInternal(const MemoryBuffer *MB,
- std::string &Error) {
- if (!parse(0, MB, Error))
+bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error,
+ bool OrderBySize) {
+ if (!parse(0, MB, Error, OrderBySize))
return false;
return true;
}
@@ -174,7 +194,7 @@ SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo,
}
bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
- std::string &Error) {
+ std::string &Error, bool OrderBySize) {
unsigned long long Version = 2;
StringRef Header = MB->getBuffer();
@@ -246,6 +266,10 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
return false;
}
}
+
+ for (Section &S : Sections)
+ S.preprocess(OrderBySize);
+
return true;
}
@@ -283,6 +307,13 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix,
return &II->second;
}
+LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) {
+ SectionMatcher.preprocess(false);
+ for (auto &[K1, E] : Entries)
+ for (auto &[K2, M] : E)
+ M.preprocess(OrderBySize);
+}
+
unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
StringRef Query,
StringRef Category) const {
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
index 804ff07..41f5187 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -54,9 +54,9 @@ static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
return std::nullopt;
}
-LLVM_ATTRIBUTE_UNUSED static void
-HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
- SmallVectorImpl<char> &Result) {
+[[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output,
+ size_t &OutputLength,
+ SmallVectorImpl<char> &Result) {
// No space left in output buffer. Double the size of the underlying
// memory in the SmallVectorImpl, adjust pointer and length and continue
// the conversion.
diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
index 8d66348..6f8e091 100644
--- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
@@ -476,7 +476,7 @@ nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
1;
- LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
+ [[maybe_unused]] static std::size_t Rows =
UnicodeNameToCodepointLargestNameSize + 1;
std::vector<char> Distances(
diff --git a/llvm/lib/Support/VirtualOutputBackends.cpp b/llvm/lib/Support/VirtualOutputBackends.cpp
index d6d7b87..de59b8a 100644
--- a/llvm/lib/Support/VirtualOutputBackends.cpp
+++ b/llvm/lib/Support/VirtualOutputBackends.cpp
@@ -498,7 +498,7 @@ Error OnDiskOutputFile::keep() {
// Someone else owns the lock on this file, wait.
switch (Lock.waitForUnlockFor(std::chrono::seconds(256))) {
case WaitForUnlockResult::Success:
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case WaitForUnlockResult::OwnerDied: {
continue; // try again to get the lock.
}
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index dad0fa3..648d6a5 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -354,8 +354,8 @@ namespace llvm {
/// Emulates hitting "retry" from an "abort, retry, ignore" CRT debug report
/// dialog. "retry" raises an exception which ultimately triggers our stack
/// dumper.
-static LLVM_ATTRIBUTE_UNUSED int
-AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
+[[maybe_unused]] static int AvoidMessageBoxHook(int ReportType, char *Message,
+ int *Return) {
// Set *Return to the retry code for the return value of _CrtDbgReport:
// http://msdn.microsoft.com/en-us/library/8hyw4sy7(v=vs.71).aspx
// This may also trigger just-in-time debugging via DebugBreak().
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 42043f7..b1024a8 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
@@ -128,6 +129,7 @@ int llvm::TableGenMain(const char *argv0,
// Record the location of the include directory so that the lexer can find
// it later.
SrcMgr.setIncludeDirs(IncludeDirs);
+ SrcMgr.setVirtualFileSystem(vfs::getRealFileSystem());
TGParser Parser(SrcMgr, MacroNames, Records, NoWarnOnUnusedTemplateArgs);
diff --git a/llvm/lib/TableGen/Parser.cpp b/llvm/lib/TableGen/Parser.cpp
index 2c3726a..db45054 100644
--- a/llvm/lib/TableGen/Parser.cpp
+++ b/llvm/lib/TableGen/Parser.cpp
@@ -9,6 +9,7 @@
#include "llvm/TableGen/Parser.h"
#include "TGParser.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;
@@ -21,6 +22,7 @@ bool llvm::TableGenParseFile(SourceMgr &InputSrcMgr, RecordKeeper &Records) {
SrcMgr = SourceMgr();
SrcMgr.takeSourceBuffersFrom(InputSrcMgr);
SrcMgr.setIncludeDirs(InputSrcMgr.getIncludeDirs());
+ SrcMgr.setVirtualFileSystem(InputSrcMgr.getVirtualFileSystem());
SrcMgr.setDiagHandler(InputSrcMgr.getDiagHandler(),
InputSrcMgr.getDiagContext());
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 2ea3a24..afce803 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1363,9 +1363,12 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
}
case LISTSPLAT: {
const auto *Value = dyn_cast<TypedInit>(LHS);
- const auto *Size = dyn_cast<IntInit>(RHS);
- if (Value && Size) {
- SmallVector<const Init *, 8> Args(Size->getValue(), Value);
+ const auto *Count = dyn_cast<IntInit>(RHS);
+ if (Value && Count) {
+ if (Count->getValue() < 0)
+ PrintFatalError(Twine("!listsplat count ") + Count->getAsString() +
+ " is negative");
+ SmallVector<const Init *, 8> Args(Count->getValue(), Value);
return ListInit::get(Args, Value->getType());
}
break;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6965116..662d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
return LowerVECREDUCE(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_FMUL:
+ return LowerVECREDUCE_MUL(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
@@ -16254,7 +16259,7 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
SplatVal > 1) {
SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
+ DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
if (Negated)
Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
@@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
}
}
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+ SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+ SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+ // Whilst we don't know the size of the vector we do know the maximum size so
+ // can perform a tree reduction with an identity vector, which means once we
+ // arrive at the result the remaining stages (when the vector is smaller than
+ // the maximum) have no effect.
+
+ unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+ unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+ for (unsigned I = 0; I < Stages; ++I) {
+ Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+ Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+ }
+
+ return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
@@ -18144,8 +18176,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
return false;
}
auto *LI = dyn_cast<LoadInst>(Load);
@@ -18223,8 +18255,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleavedValues) const {
unsigned Factor = InterleavedValues.size();
- if (Factor != 2 && Factor != 4) {
- LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+ if (Factor != 2 && Factor != 3 && Factor != 4) {
+ LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
return false;
}
StoreInst *SI = dyn_cast<StoreInst>(Store);
@@ -22942,7 +22974,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_sve_asrd:
- return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
@@ -26196,9 +26228,10 @@ static SDValue performFlagSettingCombine(SDNode *N,
return DCI.CombineTo(N, Res, SDValue(N, 1));
}
- // Combine identical generic nodes into this node, re-using the result.
+ // Combine equivalent generic nodes into this node, re-using the result.
if (SDNode *Generic = DCI.DAG.getNodeIfExists(
- GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
+ GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
+ /*AllowCommute=*/true))
DCI.CombineTo(Generic, SDValue(N, 0));
return SDValue();
@@ -30046,7 +30079,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
+ DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
if (Negated)
Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
DAG.getConstant(0, DL, ContainerVT), Res);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fd..9495c9f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -752,6 +752,7 @@ private:
SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 7322212..fe84193 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+def G_USDOT : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+ let hasSideEffects = 0;
+}
+
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
// expands into BSL/BIT/BIF after register allocation.
def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
def : GINodeEquiv<G_UDOT, AArch64udot>;
def : GINodeEquiv<G_SDOT, AArch64sdot>;
+def : GINodeEquiv<G_USDOT, AArch64usdot>;
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 30dfcf2b..12c600f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10600,6 +10600,9 @@ describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
+ if (!DestReg.isValid() || !SrcReg.isValid())
+ return std::nullopt;
+
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index b3c9656..343fd81 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -40,7 +40,11 @@ yaml::AArch64FunctionInfo::AArch64FunctionInfo(
getSVEStackSize(MFI, &llvm::AArch64FunctionInfo::getStackSizePPR)),
HasStackFrame(MFI.hasStackFrame()
? std::optional<bool>(MFI.hasStackFrame())
- : std::nullopt) {}
+ : std::nullopt),
+ HasStreamingModeChanges(
+ MFI.hasStreamingModeChanges()
+ ? std::optional<bool>(MFI.hasStreamingModeChanges())
+ : std::nullopt) {}
void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
MappingTraits<AArch64FunctionInfo>::mapping(YamlIO, *this);
@@ -55,6 +59,8 @@ void AArch64FunctionInfo::initializeBaseYamlFields(
YamlMFI.StackSizePPR.value_or(0));
if (YamlMFI.HasStackFrame)
setHasStackFrame(*YamlMFI.HasStackFrame);
+ if (YamlMFI.HasStreamingModeChanges)
+ setHasStreamingModeChanges(*YamlMFI.HasStreamingModeChanges);
}
static std::pair<bool, bool> GetSignReturnAddress(const Function &F) {
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index bd0a17d..d1832f4 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -645,6 +645,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
std::optional<uint64_t> StackSizeZPR;
std::optional<uint64_t> StackSizePPR;
std::optional<bool> HasStackFrame;
+ std::optional<bool> HasStreamingModeChanges;
AArch64FunctionInfo() = default;
AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
@@ -659,6 +660,7 @@ template <> struct MappingTraits<AArch64FunctionInfo> {
YamlIO.mapOptional("stackSizeZPR", MFI.StackSizeZPR);
YamlIO.mapOptional("stackSizePPR", MFI.StackSizePPR);
YamlIO.mapOptional("hasStackFrame", MFI.HasStackFrame);
+ YamlIO.mapOptional("hasStreamingModeChanges", MFI.HasStreamingModeChanges);
}
};
diff --git a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
index cdf2822..a90950d 100644
--- a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
@@ -75,6 +75,10 @@ bool AArch64PostCoalescer::runOnMachineFunction(MachineFunction &MF) {
if (Src != Dst)
MRI->replaceRegWith(Dst, Src);
+ if (MI.getOperand(1).isUndef())
+ for (MachineOperand &MO : MRI->use_operands(Dst))
+ MO.setIsUndef();
+
// MI must be erased from the basic block before recalculating the live
// interval.
LIS->RemoveMachineInstrFromMaps(MI);
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index f110558..7e03b97 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
}
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
- // Assume we can't combine the last pop with the sp restore.
- bool CombineAfterCSRBump = false;
+
+ unsigned ProloguePopSize = PrologueSaveSize;
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack
+ // that needs to be popped until we reach the start of the SVE save area.
+ // The "FixedObject" stack occurs after the SVE area and must be popped
+ // later.
+ ProloguePopSize -= FixedObject;
AfterCSRPopSize += FixedObject;
- } else if (!CombineSPBump && PrologueSaveSize != 0) {
+ }
+
+ // Assume we can't combine the last pop with the sp restore.
+ if (!CombineSPBump && ProloguePopSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
- AArch64InstrInfo::isSEHInstruction(*Pop))
+ AArch64InstrInfo::isSEHInstruction(*Pop) ||
+ (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(Pop)))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
@@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() {
// may clobber), convert it to a post-index ldp.
if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
convertCalleeSaveRestoreToSPPrePostIncDec(
- Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy,
- PrologueSaveSize);
+ Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy,
+ ProloguePopSize);
+ } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ MachineBasicBlock::iterator AfterLastPop = std::next(Pop);
+ if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop))
+ ++AfterLastPop;
+ // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate
+ // callee-save non-SVE registers to move the stack pointer to the start of
+ // the SVE area.
+ emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(ProloguePopSize), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
} else {
- // If not, make sure to emit an add after the last ldp.
+ // Otherwise, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
- AfterCSRPopSize += PrologueSaveSize;
- CombineAfterCSRBump = true;
+ AfterCSRPopSize += ProloguePopSize;
}
}
-
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() {
--SEHEpilogueStartI;
}
+ // Determine the ranges of SVE callee-saves. This is done before emitting any
+ // code at the end of the epilogue (for Swift async), which can get in the way
+ // of finding SVE callee-saves with CalleeSavesAboveFrameRecord.
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
@@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- auto [PPR, ZPR] = getSVEStackFrameSizes();
- auto [PPRRange, ZPRRange] = partitionSVECS(
- MBB,
- SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
- ? MBB.getFirstTerminator()
- : FirstGPRRestoreI,
- PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
-
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
StackOffset SVEStackSize =
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
@@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NeedsWinCFI, &HasWinCFI);
}
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
// Deallocate callee-save SVE registers.
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
@@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
false, NeedsWinCFI, &HasWinCFI, EmitCFI,
- StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
+ StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore));
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bc6b931..98a128e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -265,7 +265,7 @@ def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [
SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>
]>;
-def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>;
+def AArch64asrd_m1 : SDNode<"AArch64ISD::ASRD_MERGE_OP1", SDT_AArch64Arith_Imm>;
def AArch64urshri_p_node : SDNode<"AArch64ISD::URSHR_I_PRED", SDT_AArch64Arith_Imm>;
def AArch64urshri_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 12ddf47..53b00e8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -273,7 +273,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
EpilogueVectorizationMinVF = 8;
MaxInterleaveFactor = 4;
ScatterOverhead = 13;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case NeoverseN2:
case NeoverseN3:
PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 2c3870c..636d4f8a 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -8217,6 +8217,8 @@ bool AArch64AsmParser::parseDataExpr(const MCExpr *&Res) {
Spec = AArch64::S_GOTPCREL;
else if (Identifier == "plt")
Spec = AArch64::S_PLT;
+ else if (Identifier == "funcinit")
+ Spec = AArch64::S_FUNCINIT;
}
if (Spec == AArch64::S_None)
return Error(Loc, "invalid relocation specifier");
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 9e2d698..05a4313 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerTriOp(AArch64::G_UDOT);
case Intrinsic::aarch64_neon_sdot:
return LowerTriOp(AArch64::G_SDOT);
+ case Intrinsic::aarch64_neon_usdot:
+ return LowerTriOp(AArch64::G_USDOT);
case Intrinsic::aarch64_neon_sqxtn:
return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
case Intrinsic::aarch64_neon_sqxtun:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index a388216..892b8da 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -232,6 +232,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup,
}
if (RefKind == AArch64::S_AUTH || RefKind == AArch64::S_AUTHADDR)
return ELF::R_AARCH64_AUTH_ABS64;
+ if (RefKind == AArch64::S_FUNCINIT)
+ return ELF::R_AARCH64_FUNCINIT64;
return ELF::R_AARCH64_ABS64;
}
case AArch64::fixup_aarch64_add_imm12:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 2b5cf34..bc090c6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -40,6 +40,7 @@ const MCAsmInfo::AtSpecifier ELFAtSpecifiers[] = {
{AArch64::S_GOT, "GOT"},
{AArch64::S_GOTPCREL, "GOTPCREL"},
{AArch64::S_PLT, "PLT"},
+ {AArch64::S_FUNCINIT, "FUNCINIT"},
};
const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 0dfa61b..f2acff5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -164,6 +164,7 @@ enum {
// ELF relocation specifiers in data directives:
S_PLT = 0x400,
S_GOTPCREL,
+ S_FUNCINIT,
// Mach-O @ relocation specifiers:
S_MACHO_GOT,
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 4749748..434ea67 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -294,6 +294,12 @@ struct MachineSMEABI : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
+ /// Attempts to find an insertion point before \p Inst where the status flags
+ /// are not live. If \p Inst is `Block.Insts.end()` a point before the end of
+ /// the block is found.
+ std::pair<MachineBasicBlock::iterator, LiveRegs>
+ findStateChangeInsertionPoint(MachineBasicBlock &MBB, const BlockInfo &Block,
+ SmallVectorImpl<InstInfo>::const_iterator Inst);
void emitStateChange(EmitContext &, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, ZAState From,
ZAState To, LiveRegs PhysLiveRegs);
@@ -337,6 +343,28 @@ private:
MachineRegisterInfo *MRI = nullptr;
};
+static LiveRegs getPhysLiveRegs(LiveRegUnits const &LiveUnits) {
+ LiveRegs PhysLiveRegs = LiveRegs::None;
+ if (!LiveUnits.available(AArch64::NZCV))
+ PhysLiveRegs |= LiveRegs::NZCV;
+ // We have to track W0 and X0 separately as otherwise things can get
+ // confused if we attempt to preserve X0 but only W0 was defined.
+ if (!LiveUnits.available(AArch64::W0))
+ PhysLiveRegs |= LiveRegs::W0;
+ if (!LiveUnits.available(AArch64::W0_HI))
+ PhysLiveRegs |= LiveRegs::W0_HI;
+ return PhysLiveRegs;
+}
+
+static void setPhysLiveRegs(LiveRegUnits &LiveUnits, LiveRegs PhysLiveRegs) {
+ if (PhysLiveRegs & LiveRegs::NZCV)
+ LiveUnits.addReg(AArch64::NZCV);
+ if (PhysLiveRegs & LiveRegs::W0)
+ LiveUnits.addReg(AArch64::W0);
+ if (PhysLiveRegs & LiveRegs::W0_HI)
+ LiveUnits.addReg(AArch64::W0_HI);
+}
+
FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
SMEFnAttrs.hasZAState()) &&
@@ -362,26 +390,13 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
LiveRegUnits LiveUnits(*TRI);
LiveUnits.addLiveOuts(MBB);
- auto GetPhysLiveRegs = [&] {
- LiveRegs PhysLiveRegs = LiveRegs::None;
- if (!LiveUnits.available(AArch64::NZCV))
- PhysLiveRegs |= LiveRegs::NZCV;
- // We have to track W0 and X0 separately as otherwise things can get
- // confused if we attempt to preserve X0 but only W0 was defined.
- if (!LiveUnits.available(AArch64::W0))
- PhysLiveRegs |= LiveRegs::W0;
- if (!LiveUnits.available(AArch64::W0_HI))
- PhysLiveRegs |= LiveRegs::W0_HI;
- return PhysLiveRegs;
- };
-
- Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
+ Block.PhysLiveRegsAtExit = getPhysLiveRegs(LiveUnits);
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
- LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+ LiveRegs PhysLiveRegs = getPhysLiveRegs(LiveUnits);
// The SMEStateAllocPseudo marker is added to a function if the save
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
@@ -476,6 +491,49 @@ MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
return BundleStates;
}
+std::pair<MachineBasicBlock::iterator, LiveRegs>
+MachineSMEABI::findStateChangeInsertionPoint(
+ MachineBasicBlock &MBB, const BlockInfo &Block,
+ SmallVectorImpl<InstInfo>::const_iterator Inst) {
+ LiveRegs PhysLiveRegs;
+ MachineBasicBlock::iterator InsertPt;
+ if (Inst != Block.Insts.end()) {
+ InsertPt = Inst->InsertPt;
+ PhysLiveRegs = Inst->PhysLiveRegs;
+ } else {
+ InsertPt = MBB.getFirstTerminator();
+ PhysLiveRegs = Block.PhysLiveRegsAtExit;
+ }
+
+ if (!(PhysLiveRegs & LiveRegs::NZCV))
+ return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+
+ // Find the previous state change. We can not move before this point.
+ MachineBasicBlock::iterator PrevStateChangeI;
+ if (Inst == Block.Insts.begin()) {
+ PrevStateChangeI = MBB.begin();
+ } else {
+ // Note: `std::prev(Inst)` is the previous InstInfo. We only create an
+ // InstInfo object for instructions that require a specific ZA state, so the
+ // InstInfo is the site of the previous state change in the block (which can
+ // be several MIs earlier).
+ PrevStateChangeI = std::prev(Inst)->InsertPt;
+ }
+
+ // Note: LiveUnits will only accurately track X0 and NZCV.
+ LiveRegUnits LiveUnits(*TRI);
+ setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+ for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
+ // Don't move before/into a call (which may have a state change before it).
+ if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
+ break;
+ LiveUnits.stepBackward(*I);
+ if (LiveUnits.available(AArch64::NZCV))
+ return {I, getPhysLiveRegs(LiveUnits)};
+ }
+ return {InsertPt, PhysLiveRegs};
+}
+
void MachineSMEABI::insertStateChanges(EmitContext &Context,
const FunctionInfo &FnInfo,
const EdgeBundles &Bundles,
@@ -490,10 +548,13 @@ void MachineSMEABI::insertStateChanges(EmitContext &Context,
CurrentState = InState;
for (auto &Inst : Block.Insts) {
- if (CurrentState != Inst.NeededState)
- emitStateChange(Context, MBB, Inst.InsertPt, CurrentState,
- Inst.NeededState, Inst.PhysLiveRegs);
- CurrentState = Inst.NeededState;
+ if (CurrentState != Inst.NeededState) {
+ auto [InsertPt, PhysLiveRegs] =
+ findStateChangeInsertionPoint(MBB, Block, &Inst);
+ emitStateChange(Context, MBB, InsertPt, CurrentState, Inst.NeededState,
+ PhysLiveRegs);
+ CurrentState = Inst.NeededState;
+ }
}
if (MBB.succ_empty())
@@ -501,9 +562,12 @@ void MachineSMEABI::insertStateChanges(EmitContext &Context,
ZAState OutState =
BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)];
- if (CurrentState != OutState)
- emitStateChange(Context, MBB, MBB.getFirstTerminator(), CurrentState,
- OutState, Block.PhysLiveRegsAtExit);
+ if (CurrentState != OutState) {
+ auto [InsertPt, PhysLiveRegs] =
+ findStateChangeInsertionPoint(MBB, Block, Block.Insts.end());
+ emitStateChange(Context, MBB, InsertPt, CurrentState, OutState,
+ PhysLiveRegs);
+ }
}
}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index dd6fa16..d71f728 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -130,6 +130,12 @@ SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI)
if (auto *CalledFunction = CB.getCalledFunction())
CalledFn = SMEAttrs(*CalledFunction, TLI);
+ // An `invoke` of an agnostic ZA function may not return normally (it may
+ // resume in an exception block). In this case, it acts like a private ZA
+ // callee and may require a ZA save to be set up before it is called.
+ if (isa<InvokeInst>(CB))
+ CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
+
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
assert((IsIndirect ||
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
- Result = true;
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+ // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+ // For our purposes, these shall not be classified as VALU as this results
+ // in unexpected behavior.
+ Result = !MI.mayLoadOrStore();
+ }
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
TII->isSALU(MI))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
TII->isMFMAorWMMA(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a44af5f..8ed4062 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
- Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i32, Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
@@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
R = getMad(DAG, DL, VT, YH, CH, Mad1);
}
- const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
- (Flags.hasNoInfs() || Options.NoInfsFPMath);
+ const bool IsFiniteOnly =
+ (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
// TODO: Check if known finite from source value.
if (!IsFiniteOnly) {
@@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
- const auto &Options = getTargetMachine().Options;
- if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
+ if (!Flags.hasNoInfs()) {
SDValue OverflowCheckConst =
DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
SDValue Overflow =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee466ca..596a895 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const bool IsFiniteOnly =
(MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
- (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
+ MI.getFlag(MachineInstr::FmNoInfs);
if (!IsFiniteOnly) {
// Expand isfinite(x) => fabs(x) < inf
@@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
R = B.buildSelect(Ty, Underflow, Zero, R);
- const auto &Options = MF.getTarget().Options;
-
- if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
+ if (!(Flags & MachineInstr::FmNoInfs)) {
auto OverflowCheckConst =
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index bfe2c80..a67b12a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -901,6 +901,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
+ addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
+
bool hasSALUFloat = ST->hasSALUFloatInsts();
addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d0ad120..b841171 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1488,6 +1488,12 @@ let AssemblerPredicate = isGFX12Plus in {
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
+// Additional aliases for ds load transpose instructions.
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index be62395..e3f3aba 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
}
if (Imm == AMDGPU::EncValues::LITERAL_CONST) {
- Op = decodeLiteralConstant(
- Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64);
+ Op = decodeLiteralConstant(Desc, OpDesc);
continue;
}
@@ -893,6 +892,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// have EXEC as implicit destination. Issue a warning if encoding for
// vdst is not EXEC.
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) &&
+ MCII->get(MI.getOpcode()).getNumDefs() == 0 &&
MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) {
auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO);
if (Bytes_[0] != ExecEncoding)
@@ -1545,21 +1545,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const {
if (HasLiteral) {
- if (Literal64 != Val)
+ if (Literal != Val)
return errOperand(Val, "More than one unique literal is illegal");
}
HasLiteral = true;
- Literal = Literal64 = Val;
+ Literal = Val;
- bool UseLit64 = Hi_32(Literal64) == 0;
+ bool UseLit64 = Hi_32(Literal) == 0;
return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Literal64, getContext()))
- : MCOperand::createImm(Literal64);
+ LitModifier::Lit64, Literal, getContext()))
+ : MCOperand::createImm(Literal);
}
-MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
- const MCOperandInfo &OpDesc,
- bool ExtendFP64) const {
+MCOperand
+AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
+ const MCOperandInfo &OpDesc) const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
@@ -1569,35 +1569,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
Twine(Bytes.size()));
}
HasLiteral = true;
- Literal = Literal64 = eatBytes<uint32_t>(Bytes);
- if (ExtendFP64)
- Literal64 <<= 32;
+ Literal = eatBytes<uint32_t>(Bytes);
}
- int64_t Val = ExtendFP64 ? Literal64 : Literal;
+ // For disassembling always assume all inline constants are available.
+ bool HasInv2Pi = true;
- bool CanUse64BitLiterals =
- STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
- !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
-
- bool UseLit64 = false;
- if (CanUse64BitLiterals) {
- if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64)
- UseLit64 = false;
- else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64)
- UseLit64 = Hi_32(Literal64) == 0;
+ // Invalid instruction codes may contain literals for inline-only
+ // operands, so we support them here as well.
+ int64_t Val = Literal;
+ bool UseLit = false;
+ switch (OpDesc.OperandType) {
+ default:
+ llvm_unreachable("Unexpected operand type!");
+ case AMDGPU::OPERAND_REG_IMM_BF16:
+ case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+ UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2BF16:
+ UseLit = AMDGPU::isInlinableLiteralV2BF16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ UseLit = AMDGPU::isInlinableLiteralV2F16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ UseLit = AMDGPU::isInlinableLiteralV2I16(Val);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_KIMM32:
+ UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ Val <<= 32;
+ break;
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi);
+ break;
+ case MCOI::OPERAND_REGISTER:
+ // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 hits
+ // decoding a literal in a position of a register operand. Give
+ // it special handling in the caller, decodeImmOperands(), instead
+ // of quietly allowing it here.
+ break;
}
- return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Val, getContext()))
- : MCOperand::createImm(Val);
+ return UseLit ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
+ LitModifier::Lit, Val, getContext()))
+ : MCOperand::createImm(Val);
}
-MCOperand
-AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
+MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const {
assert(STI.hasFeature(AMDGPU::Feature64BitLiterals));
if (!HasLiteral) {
@@ -1606,25 +1650,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
Twine(Bytes.size()));
}
HasLiteral = true;
- Literal64 = eatBytes<uint64_t>(Bytes);
- }
-
- bool UseLit64 = false;
- const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
- const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()];
- if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) {
- UseLit64 = false;
- } else {
- assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
- OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64);
- UseLit64 = Hi_32(Literal64) == 0;
+ Literal = eatBytes<uint64_t>(Bytes);
}
+ bool UseLit64 = Hi_32(Literal) == 0;
return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
- LitModifier::Lit64, Literal64, getContext()))
- : MCOperand::createImm(Literal64);
+ LitModifier::Lit64, Literal, getContext()))
+ : MCOperand::createImm(Literal);
}
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1913,7 +1945,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst,
return MCOperand::createImm(Val);
if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) {
- return decodeLiteral64Constant(Inst);
+ return decodeLiteral64Constant();
}
switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 2751857..d103d79 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -44,8 +44,7 @@ private:
const unsigned HwModeRegClass;
const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
- mutable uint32_t Literal;
- mutable uint64_t Literal64;
+ mutable uint64_t Literal;
mutable bool HasLiteral;
mutable std::optional<bool> EnableWavefrontSize32;
unsigned CodeObjectVersion;
@@ -144,9 +143,8 @@ public:
MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const;
MCOperand decodeLiteralConstant(const MCInstrDesc &Desc,
- const MCOperandInfo &OpDesc,
- bool ExtendFP64) const;
- MCOperand decodeLiteral64Constant(const MCInst &Inst) const;
+ const MCOperandInfo &OpDesc) const;
+ MCOperand decodeLiteral64Constant() const;
MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6de59be..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -3711,6 +3711,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+// Additional aliases for global load transpose instructions.
+def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>;
+def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>;
+
defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 71494be..4e11c4f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI) {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ auto &OS = dbgs();
+ const char *RegName = GCNRegPressure::getName(Kind);
+
+ unsigned MaxNumRegs = 0;
+ const MachineInstr *MaxPressureMI = nullptr;
+ GCNUpwardRPTracker RPT(LIS);
+ for (const MachineBasicBlock &MBB : MF) {
+ RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+ for (const MachineInstr &MI : reverse(MBB)) {
+ RPT.recede(MI);
+ unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+ if (NumRegs > MaxNumRegs) {
+ MaxNumRegs = NumRegs;
+ MaxPressureMI = &MI;
+ }
+ }
+ }
+
+ SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+ // Max pressure can occur at either the early-clobber or register slot.
+ // Choose the maximum liveset between both slots. This is ugly but this is
+ // diagnostic code.
+ SlotIndex ECSlot = MISlot.getRegSlot(true);
+ SlotIndex RSlot = MISlot.getRegSlot(false);
+ GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+ GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+ unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+ unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+ GCNRPTracker::LiveRegSet *LiveSet =
+ ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+ SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+ assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+ // Split live registers into single-def and multi-def sets.
+ GCNRegPressure SDefPressure, MDefPressure;
+ SmallVector<Register, 16> SDefRegs, MDefRegs;
+ for (auto [Reg, LaneMask] : *LiveSet) {
+ assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.getNumValNums() == 1 ||
+ (LI.hasSubRanges() &&
+ llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+ return SR.getNumValNums() == 1;
+ }))) {
+ SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ SDefRegs.push_back(Reg);
+ } else {
+ MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+ MDefRegs.push_back(Reg);
+ }
+ }
+ unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+ unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+ assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+ auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
+ return Printable([&, MBB, SI](raw_ostream &OS) {
+ OS << SI << ':' << printMBBReference(*MBB);
+ if (MLI)
+ if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+ OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
+ << ", Depth " << ML->getLoopDepth() << ")";
+ });
+ };
+
+ auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+ GCNRegPressure RegPressure;
+ RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+ OS << " " << printReg(Reg, TRI) << ':'
+ << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+ << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+ << RegName << "s)\n";
+
+ // Use std::map to sort def/uses by SlotIndex.
+ std::map<SlotIndex, const MachineInstr *> Instrs;
+ for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) {
+ Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+ }
+
+ for (const auto &[SI, MI] : Instrs) {
+ OS << " ";
+ if (MI->definesRegister(Reg, TRI))
+ OS << "def ";
+ if (MI->readsRegister(Reg, TRI))
+ OS << "use ";
+ OS << printLoc(MI->getParent(), SI) << ": " << *MI;
+ }
+ };
+
+ OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+ << " ***\n";
+ OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+ << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+ << *MaxPressureMI;
+
+ OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+ << RegName << "s):\n";
+
+ // Sort SDefRegs by number of uses (smallest first)
+ llvm::sort(SDefRegs, [&](Register A, Register B) {
+ return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
+ std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
+ });
+
+ for (const Register Reg : SDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+
+ OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+ << RegName << "s):\n";
+ for (const Register Reg : MDefRegs) {
+ PrintRegInfo(Reg, LiveSet->lookup(Reg));
+ }
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 898d1ff..979a8b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,12 @@ class SlotIndex;
struct GCNRegPressure {
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
+ static constexpr const char *getName(RegKind Kind) {
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
+ assert(Kind < TOTAL_KINDS);
+ return Names[Kind];
+ }
+
GCNRegPressure() {
clear();
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ unsigned getNumRegs(RegKind Kind) const {
+ assert(Kind < TOTAL_KINDS);
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
void dump() const;
+ static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI);
+ return (RegKind)getRegKind(MRI.getRegClass(Reg), STI);
+ }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ public:
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ public:
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc0810..9fbf9e5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,27 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<unsigned> PendingQueueLimit(
+ "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
+ cl::desc(
+ "Max (Available+Pending) size to inspect pending queue (0 disables)"),
+ cl::init(256));
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define DUMP_MAX_REG_PRESSURE
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false));
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false));
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -320,17 +341,52 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
+static bool shouldCheckPending(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ bool HasBufferedModel =
+ SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
+ unsigned Combined = Zone.Available.size() + Zone.Pending.size();
+ return Combined <= PendingQueueLimit && HasBufferedModel;
+}
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone,
+ const TargetSchedModel *SchedModel) {
+ // pickOnlyChoice() releases pending instructions and checks for new hazards.
+ SUnit *OnlyChoice = Zone.pickOnlyChoice();
+ if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
+ return OnlyChoice;
+
+ return nullptr;
+}
+
+void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred) {
+ LLVM_DEBUG({
+ dbgs() << "Prefer:\t\t";
+ DAG->dumpNode(*Preferred.SU);
+
+ if (Current.SU) {
+ dbgs() << "Not:\t";
+ DAG->dumpNode(*Current.SU);
+ }
+
+ dbgs() << "Reason:\t\t";
+ traceCandidate(Preferred);
+ });
+}
+
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand,
+ SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
@@ -343,8 +399,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
- ReadyQueue &Q = Zone.Available;
- for (SUnit *SU : Q) {
+ LLVM_DEBUG(dbgs() << "Available Q:\n");
+ ReadyQueue &AQ = Zone.Available;
+ for (SUnit *SU : AQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
@@ -356,27 +413,55 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ Cand.setBest(TryCand);
+ } else {
+ printCandidateDecision(TryCand, Cand);
+ }
+ }
+
+ if (!shouldCheckPending(Zone, SchedModel))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Pending Q:\n");
+ ReadyQueue &PQ = Zone.Pending;
+ for (SUnit *SU : PQ) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryPendingCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ IsPending = true;
Cand.setBest(TryCand);
- LLVM_DEBUG(traceCandidate(Cand));
+ } else {
+ printCandidateDecision(TryCand, Cand);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
-SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
+ bool &PickedPending) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
- if (SUnit *SU = Bot.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
return SU;
}
- if (SUnit *SU = Top.pickOnlyChoice()) {
+ if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
return SU;
}
- // Set the bottom-up policy based on the state of the current bottom zone and
- // the instructions outside the zone, including the top zone.
+ // Set the bottom-up policy based on the state of the current bottom zone
+ // and the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
@@ -384,12 +469,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+ bool BotPending = false;
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -399,6 +486,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
+ BotPending,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -406,12 +494,14 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
#endif
}
+ bool TopPending = false;
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
@@ -421,6 +511,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
+ TopPending,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
@@ -431,12 +522,21 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand = BotCand;
- TopCand.Reason = NoCand;
- tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
+ SchedCandidate Cand = BotPending ? TopCand : BotCand;
+ SchedCandidate TryCand = BotPending ? BotCand : TopCand;
+ PickedPending = BotPending && TopPending;
+
+ TryCand.Reason = NoCand;
+ if (BotPending || TopPending) {
+ PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
+ } else {
+ tryCandidate(Cand, TryCand, nullptr);
+ }
+
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
}
+
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
@@ -451,35 +551,55 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
+ bool PickedPending;
SUnit *SU;
do {
+ PickedPending = false;
if (RegionPolicy.OnlyTopDown) {
- SU = Top.pickOnlyChoice();
+ SU = pickOnlyChoice(Top, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
- SU = Bot.pickOnlyChoice();
+ SU = pickOnlyChoice(Bot, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
+ PickedPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
- SU = pickNodeBidirectional(IsTopNode);
+ SU = pickNodeBidirectional(IsTopNode, PickedPending);
}
} while (SU->isScheduled);
+ if (PickedPending) {
+ unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
+ SchedBoundary &Zone = IsTopNode ? Top : Bot;
+ unsigned CurrentCycle = Zone.getCurrCycle();
+ if (ReadyCycle > CurrentCycle)
+ Zone.bumpCycle(ReadyCycle);
+
+ // FIXME: checkHazard() doesn't give information about which cycle the
+ // hazard will resolve so just keep bumping the cycle by 1. This could be
+ // made more efficient if checkHazard() returned more details.
+ while (Zone.checkHazard(SU))
+ Zone.bumpCycle(Zone.getCurrCycle() + 1);
+
+ Zone.releasePending();
+ }
+
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
@@ -525,6 +645,47 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const {
return *std::next(CurrentStage);
}
+bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Bias PhysReg Defs and copies to their uses and defined respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ return false;
+}
+
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
@@ -960,6 +1121,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageBeforeScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1164,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
}
#ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 8ea4267..975781f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -44,17 +44,32 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
- SUnit *pickNodeBidirectional(bool &IsTopNode);
+ SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
- SchedCandidate &Cand, bool IsBottomUp);
+ SchedCandidate &Cand, bool &IsPending,
+ bool IsBottomUp);
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Evaluates instructions in the pending queue using a subset of scheduling
+ /// heuristics.
+ ///
+ /// Instructions that cannot be issued due to hardware constraints are placed
+ /// in the pending queue rather than the available queue, making them normally
+ /// invisible to scheduling heuristics. However, in certain scenarios (such as
+ /// avoiding register spilling), it may be beneficial to consider scheduling
+ /// these not-yet-ready instructions.
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
+
+ void printCandidateDecision(const SchedCandidate &Current,
+ const SchedCandidate &Preferred);
+
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 64e34db..5f6d742 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -260,8 +260,12 @@ class NSAHelper {
}
class MIMGNSAHelper<int num_addrs,
- list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
- : NSAHelper<> {
+ list<RegisterOperand> addr_types_in=[]>
+ : NSAHelper<> {
+ list<RegisterOperand> addr_types =
+ !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs),
+ addr_types_in);
+
list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
@@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[],
+ list<RegisterOperand> addr_types=[],
RegisterOperand LastAddrRC = VGPROp_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Only;
@@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
}
class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[]>
+ list<RegisterOperand> addr_types=[]>
: VIMAGE<outs, dns>, VIMAGEe<op> {
let SubtargetPredicate = isGFX12Plus;
let AssemblerPredicate = isGFX12Plus;
@@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
int VAddrDwords = !srl(Size, 5);
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
- RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
- list<RegisterClass> GFX11PlusAddrTypes =
- !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32],
- isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64],
- IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96],
- true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32);
+ list<RegisterOperand> GFX11PlusAddrTypes =
+ !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32],
+ isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64],
+ IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96],
+ true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC>
@@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC>
}
class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
addr_types> {
let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
@@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
bit isDual, bit isBVH8,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8),
(outs VReg_320:$vdata, VReg_96:$ray_origin_out,
VReg_96:$ray_dir_out),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80e985d..a2841c11 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18168,7 +18168,7 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
return CacheLineAlign;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
assert(N->getOpcode() == ISD::CopyFromReg);
do {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5e27b37..6dcbced 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
- setScoreLB(T, CurrScore - 1);
+ setScoreLB(T, getScoreUB(T) - 1);
PendingEvents &= ~(1 << OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ec5c5bb3..50447f4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -865,22 +865,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (DestReg == AMDGPU::VCC_LO) {
- if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
+ if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ if (DestReg == AMDGPU::VCC_LO) {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
}
- return;
- }
-
- if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -898,22 +892,16 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (DestReg == AMDGPU::VCC) {
- if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
- .addReg(SrcReg, getKillRegState(KillSrc));
- } else {
+ if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
+ if (DestReg == AMDGPU::VCC) {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
- .addImm(0)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
}
- return;
- }
-
- if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
@@ -9084,6 +9072,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();
+ if (ST.useRealTrue16Insts()) {
+ Register SrcReg0, SrcReg1;
+ if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
+ SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
+ } else {
+ SrcReg0 = Src0.getReg();
+ }
+
+ if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
+ SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
+ } else {
+ SrcReg1 = Src1.getReg();
+ }
+
+ bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
+ bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
+
+ auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_LH_B32_B16:
+ NewMI
+ .addReg(SrcReg0, 0,
+ isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HL_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0,
+ isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
+ .addImm(AMDGPU::hi16);
+ break;
+ case AMDGPU::S_PACK_HH_B32_B16:
+ NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::lo16)
+ .addReg(SrcReg1, 0, AMDGPU::hi16)
+ .addImm(AMDGPU::hi16);
+ break;
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ return;
+ }
+
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e979eeb..df27ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -879,6 +879,11 @@ public:
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
}
+ bool isMFMA(uint16_t Opcode) const {
+ return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
+ Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
+ }
+
static bool isDOT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
}
@@ -895,6 +900,10 @@ public:
return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
}
+ bool isMFMAorWMMA(uint16_t Opcode) const {
+ return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
+ }
+
static bool isSWMMAC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index eac9fd4..27e5ee9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3726,6 +3726,23 @@ def : GCNPat <
} // End foreach Ty = ...
} // End AddedComplexity = 1
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (zext i16:$src_lo)),
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi)))))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))),
+ (i32 (zext i16:$src_lo))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index c684f9e..7431e11 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ private:
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
- // Check if the machine instruction being processed is a supported packed
- // instruction.
- bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
-// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- if (!TII->isNeverCoissue(MI))
- return false;
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_FMA_F32:
- return true;
- default:
- return false;
- }
- llvm_unreachable("Fully covered switch");
-}
-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
+ uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
+    bool IsUnpackable =
+        UnpackedOpCode != std::numeric_limits<uint16_t>::max();
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
- (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
- if (!isUnpackingSupportedInstr(Instr))
+ if (!IsUnpackable)
continue;
if (canUnpackingClobberRegister(Instr))
@@ -654,7 +637,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
}
- return;
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
@@ -681,7 +663,6 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
HiDstOp.setIsRenamable(DstOp.isRenamable());
I.eraseFromParent();
- return;
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
@@ -689,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
- const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -704,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 =
+ const MachineOperand *SrcMO2 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
unsigned Src2Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
- addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
@@ -789,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
+
+ // Perform the extra MF scans only for supported archs
+ if (!ST.hasGFX940Insts())
+ return Changed;
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions overlapped by MFMAs. This allows the
+ // compiler to co-issue unpacked instructions with MFMA
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index be1c883..ebd2e7e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2356,7 +2356,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
AMDGPU::M0)
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
@@ -2446,7 +2446,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
AMDGPU::M0)
.add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a01a5fd..5e3195b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1697,9 +1697,6 @@ LLVM_READNONE
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
-bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
-
-LLVM_READNONE
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 9945ecc..0d7b6d1 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -161,8 +161,8 @@ namespace {
friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
return TE.PseudoOpc < PseudoOpc;
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
- const NEONLdStTableEntry &TE) {
+ [[maybe_unused]] friend bool operator<(unsigned PseudoOpc,
+ const NEONLdStTableEntry &TE) {
return PseudoOpc < TE.PseudoOpc;
}
};
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd..35e1127 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21287,21 +21287,28 @@ bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const {
}
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
+ // MSVC CRT provides functionalities for stack protection.
RTLIB::LibcallImpl SecurityCheckCookieLibcall =
getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
- if (SecurityCheckCookieLibcall == RTLIB::Unsupported)
- return TargetLowering::insertSSPDeclarations(M);
- // MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
- PointerType::getUnqual(M.getContext()));
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+ // MSVC CRT has a global variable holding security cookie.
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
+ PointerType::getUnqual(M.getContext()));
- // MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- getLibcallImplName(SecurityCheckCookieLibcall),
- Type::getVoidTy(M.getContext()), PointerType::getUnqual(M.getContext()));
- if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
- F->addParamAttr(0, Attribute::AttrKind::InReg);
+ // MSVC CRT has a function to validate security cookie.
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+ if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
+ F->addParamAttr(0, Attribute::AttrKind::InReg);
+ }
+
+ TargetLowering::insertSSPDeclarations(M);
}
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
diff --git a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
index 39e651d..8945ec3 100644
--- a/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
+++ b/llvm/lib/Target/CSKY/Disassembler/CSKYDisassembler.cpp
@@ -166,7 +166,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, uint64_t RegNo,
}
// TODO
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static DecodeStatus DecodesFPR128RegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index ca81d30..8ace2d2 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/MD5.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <cstdint>
#include <optional>
using namespace llvm;
@@ -193,7 +194,12 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) {
dxbc::PSV::v2::ResourceBindInfo BindInfo;
BindInfo.Type = Type;
BindInfo.LowerBound = Binding.LowerBound;
- BindInfo.UpperBound = Binding.LowerBound + Binding.Size - 1;
+      assert((Binding.Size == UINT32_MAX ||
+              (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) &&
+             "Resource range is too large");
+ BindInfo.UpperBound = (Binding.Size == UINT32_MAX)
+ ? UINT32_MAX
+ : Binding.LowerBound + Binding.Size - 1;
BindInfo.Space = Binding.Space;
BindInfo.Kind = static_cast<dxbc::PSV::ResourceKind>(Kind);
BindInfo.Flags = Flags;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 228114c..44c4830 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -57,6 +57,7 @@ def ResBindTy : DXILOpParamType;
def ResPropsTy : DXILOpParamType;
def SplitDoubleTy : DXILOpParamType;
def BinaryWithCarryTy : DXILOpParamType;
+def DimensionsTy : DXILOpParamType;
class DXILOpClass;
@@ -901,6 +902,13 @@ def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> {
let attributes = [Attributes<DXIL1_0, [ReadOnly]>];
}
+def GetDimensions : DXILOp<72, getDimensions> {
+ let Doc = "gets the dimensions of a buffer or texture";
+ let arguments = [HandleTy, Int32Ty];
+ let result = DimensionsTy;
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
def Barrier : DXILOp<80, barrier> {
let Doc = "inserts a memory barrier in the shader";
let intrinsics = [
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 1aed8f9..944b2e6 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -261,6 +261,12 @@ static StructType *getBinaryWithCarryType(LLVMContext &Context) {
return StructType::create({Int32Ty, Int1Ty}, "dx.types.i32c");
}
+static StructType *getDimensionsType(LLVMContext &Ctx) {
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ return getOrCreateStructType("dx.types.Dimensions",
+ {Int32Ty, Int32Ty, Int32Ty, Int32Ty}, Ctx);
+}
+
static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
Type *OverloadTy) {
switch (Kind) {
@@ -318,6 +324,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
return getSplitDoubleType(Ctx);
case OpParamType::BinaryWithCarryTy:
return getBinaryWithCarryType(Ctx);
+ case OpParamType::DimensionsTy:
+ return getDimensionsType(Ctx);
}
llvm_unreachable("Invalid parameter kind");
return nullptr;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 610d8b6..e46a393 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -627,6 +627,28 @@ public:
});
}
+ [[nodiscard]] bool lowerGetDimensionsX(Function &F) {
+ IRBuilder<> &IRB = OpBuilder.getIRB();
+ Type *Int32Ty = IRB.getInt32Ty();
+
+ return replaceFunction(F, [&](CallInst *CI) -> Error {
+ IRB.SetInsertPoint(CI);
+ Value *Handle =
+ createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+ Value *Undef = UndefValue::get(Int32Ty);
+
+ Expected<CallInst *> OpCall = OpBuilder.tryCreateOp(
+ OpCode::GetDimensions, {Handle, Undef}, CI->getName(), Int32Ty);
+ if (Error E = OpCall.takeError())
+ return E;
+ Value *Dim = IRB.CreateExtractValue(*OpCall, 0);
+
+ CI->replaceAllUsesWith(Dim);
+ CI->eraseFromParent();
+ return Error::success();
+ });
+ }
+
[[nodiscard]] bool lowerGetPointer(Function &F) {
// These should have already been handled in DXILResourceAccess, so we can
// just clean up the dead prototype.
@@ -934,6 +956,9 @@ public:
case Intrinsic::dx_resource_updatecounter:
HasErrors |= lowerUpdateCounter(F);
break;
+ case Intrinsic::dx_resource_getdimensions_x:
+ HasErrors |= lowerGetDimensionsX(F);
+ break;
case Intrinsic::ctpop:
HasErrors |= lowerCtpopToCountBits(F);
break;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 82c43ff..26a8728 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1165,12 +1165,15 @@ void DXILBitcodeWriter::writeValueSymbolTableForwardDecl() {}
/// Returns the bit offset to backpatch with the location of the real VST.
void DXILBitcodeWriter::writeModuleInfo() {
// Emit various pieces of data attached to a module.
- if (!M.getTargetTriple().empty())
- writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE,
- M.getTargetTriple().str(), 0 /*TODO*/);
- const std::string &DL = M.getDataLayoutStr();
- if (!DL.empty())
- writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
+
+ // We need to hardcode a triple and datalayout that's compatible with the
+ // historical DXIL triple and datalayout from DXC.
+ StringRef Triple = "dxil-ms-dx";
+ StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-"
+ "f16:32-f32:32-f64:64-n8:16:32:64";
+ writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/);
+ writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
+
if (!M.getModuleInlineAsm().empty())
writeStringRecord(Stream, bitc::MODULE_CODE_ASM, M.getModuleInlineAsm(),
0 /*TODO*/);
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
index 1eb03bf..725f2b1 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILWriterPass.cpp
@@ -149,11 +149,6 @@ public:
std::string Data;
llvm::raw_string_ostream OS(Data);
- Triple OriginalTriple = M.getTargetTriple();
- // Set to DXIL triple when write to bitcode.
- // Only the output bitcode need to be DXIL triple.
- M.setTargetTriple(Triple("dxil-ms-dx"));
-
// Perform late legalization of lifetime intrinsics that would otherwise
// fail the Module Verifier if performed in an earlier pass
legalizeLifetimeIntrinsics(M);
@@ -165,9 +160,6 @@ public:
// not-so-legal legalizations
removeLifetimeIntrinsics(M);
- // Recover triple.
- M.setTargetTriple(OriginalTriple);
-
Constant *ModuleConstant =
ConstantDataArray::get(M.getContext(), arrayRefFromStringRef(Data));
auto *GV = new llvm::GlobalVariable(M, ModuleConstant->getType(), true,
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 974f653..3bd6ed4 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -66,6 +66,10 @@ public:
void remapInstruction(MCInst &Instr) const;
+ Expected<bool> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const override;
+
private:
bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
uint64_t &BytesToSkip, raw_ostream &CS) const;
@@ -567,6 +571,18 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
return Result;
}
+Expected<bool> HexagonDisassembler::onSymbolStart(SymbolInfoTy &Symbol,
+ uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ // At the start of a symbol, force a fresh packet by resetting any
+ // in-progress bundle state. This prevents packets from straddling label
+ // boundaries when data (e.g. jump tables) appears in between.
+ Size = 0;
+ resetBundle();
+ return true;
+}
+
static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
ArrayRef<MCPhysReg> Table) {
if (RegNo < Table.size()) {
@@ -667,11 +683,10 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable);
}
-LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily.
- static DecodeStatus
- DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo,
- uint64_t /*Address*/,
- const MCDisassembler *Decoder) {
+[[maybe_unused]] // Suppress warning temporarily.
+static DecodeStatus DecodeHvxVQRRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t /*Address*/,
+ const MCDisassembler *Decoder) {
static const MCPhysReg HvxVQRDecoderTable[] = {
Hexagon::VQ0, Hexagon::VQ1, Hexagon::VQ2, Hexagon::VQ3,
Hexagon::VQ4, Hexagon::VQ5, Hexagon::VQ6, Hexagon::VQ7};
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 52e6b0b..68f5312 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -174,8 +174,8 @@ namespace {
const TargetRegisterInfo *TRI;
};
- raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P)
- LLVM_ATTRIBUTE_UNUSED;
+ [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
+ const PrintRegSet &P);
raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
OS << '{';
for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R))
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 14b6bb3..9087f9d 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -272,15 +272,14 @@ namespace {
OS << *I << ' ' << **I << '\n';
}
- raw_ostream &operator<< (raw_ostream &OS,
- const NodeVect &S) LLVM_ATTRIBUTE_UNUSED;
+ [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const NodeVect &S);
raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) {
dump_node_container(OS, S);
return OS;
}
- raw_ostream &operator<< (raw_ostream &OS,
- const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED;
+ [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
+ const NodeToUsesMap &M);
raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){
for (const auto &I : M) {
const UseSet &Us = I.second;
@@ -914,9 +913,8 @@ namespace {
const NodeToValueMap &Map;
};
- raw_ostream &operator<< (raw_ostream &OS,
- const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ;
- raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+ [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
+ const LocationAsBlock &Loc) {
for (const auto &I : Loc.Map) {
OS << I.first << " -> ";
if (BasicBlock *B = cast_or_null<BasicBlock>(I.second))
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 5dde47a..a3296e0 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -419,8 +419,8 @@ namespace {
using HCE = HexagonConstExtenders;
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const OffsetRange &OR) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const OffsetRange &OR) {
if (OR.Min > OR.Max)
OS << '!';
OS << '[' << OR.Min << ',' << OR.Max << "]a" << unsigned(OR.Align)
@@ -435,8 +435,8 @@ namespace {
const HexagonRegisterInfo &HRI;
};
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &P) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const PrintRegister &P) {
if (P.Rs.Reg != 0)
OS << printReg(P.Rs.Reg, &P.HRI, P.Rs.Sub);
else
@@ -451,8 +451,8 @@ namespace {
const HexagonRegisterInfo &HRI;
};
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const PrintExpr &P) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const PrintExpr &P) {
OS << "## " << (P.Ex.Neg ? "- " : "+ ");
if (P.Ex.Rs.Reg != 0)
OS << printReg(P.Ex.Rs.Reg, &P.HRI, P.Ex.Rs.Sub);
@@ -469,15 +469,15 @@ namespace {
const HexagonRegisterInfo &HRI;
};
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const PrintInit &P) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const PrintInit &P) {
OS << '[' << P.ExtI.first << ", "
<< PrintExpr(P.ExtI.second, P.HRI) << ']';
return OS;
}
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtDesc &ED) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtDesc &ED) {
assert(ED.OpNum != -1u);
const MachineBasicBlock &MBB = *ED.getOp().getParent()->getParent();
const MachineFunction &MF = *MBB.getParent();
@@ -493,8 +493,8 @@ namespace {
return OS;
}
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtRoot &ER) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtRoot &ER) {
switch (ER.Kind) {
case MachineOperand::MO_Immediate:
OS << "imm:" << ER.V.ImmVal;
@@ -527,8 +527,8 @@ namespace {
return OS;
}
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const HCE::ExtValue &EV) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const HCE::ExtValue &EV) {
OS << HCE::ExtRoot(EV) << " off:" << EV.Offset;
return OS;
}
@@ -540,8 +540,8 @@ namespace {
const HexagonRegisterInfo &HRI;
};
- LLVM_ATTRIBUTE_UNUSED
- raw_ostream &operator<< (raw_ostream &OS, const PrintIMap &P) {
+ [[maybe_unused]]
+ raw_ostream &operator<<(raw_ostream &OS, const PrintIMap &P) {
OS << "{\n";
for (const std::pair<const HCE::ExtenderInit, HCE::IndexList> &Q : P.IMap) {
OS << " " << PrintInit(Q.first, P.HRI) << " -> {";
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 14a7ae7..3900aac 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -132,8 +132,7 @@ namespace {
const TargetRegisterInfo &TRI;
friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P);
};
- raw_ostream &operator<<(raw_ostream &OS,
- const PrintFP &P) LLVM_ATTRIBUTE_UNUSED;
+ [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P);
raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
<< ", PredR:" << printReg(P.FP.PredR, &P.TRI)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index f9fdab4..9c81e96 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -51,11 +51,11 @@ private:
const TargetRegisterInfo &TRI;
};
- raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR)
- LLVM_ATTRIBUTE_UNUSED;
- raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) {
- return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg);
- }
+[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
+ const PrintRegister &PR);
+raw_ostream &operator<<(raw_ostream &OS, const PrintRegister &PR) {
+ return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg);
+}
class HexagonGenPredicate : public MachineFunctionPass {
public:
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 4d96cfa..c7a4f68 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -789,7 +789,7 @@ struct ShuffleMask {
}
};
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) {
SM.print(OS);
return OS;
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 87d052b..e4c0a16 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -364,7 +364,7 @@ private:
const HexagonVectorCombine &HVC;
};
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
OS << "Addr: " << *AI.Addr << '\n';
@@ -375,7 +375,7 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
return OS;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
@@ -394,7 +394,7 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
return OS;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS,
const AlignVectors::ByteSpan::Block &B) {
OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
@@ -408,7 +408,7 @@ raw_ostream &operator<<(raw_ostream &OS,
return OS;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
for (const AlignVectors::ByteSpan::Block &B : BS)
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index fa8ae60..2ff5843 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -111,7 +111,7 @@ namespace {
friend raw_ostream &operator<< (raw_ostream &OS, const DepChain &D);
};
- LLVM_ATTRIBUTE_UNUSED
+ [[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const DepChain &D) {
const ChainOfDependences &CD = D.Chain;
int ChainSize = CD.size();
@@ -144,7 +144,7 @@ namespace {
bool isDefined() { return Inst2Replace != nullptr; }
};
- LLVM_ATTRIBUTE_UNUSED
+ [[maybe_unused]]
raw_ostream &operator<<(raw_ostream &OS, const ReuseValue &RU) {
OS << "** ReuseValue ***\n";
OS << "Instruction to Replace: " << *(RU.Inst2Replace) << "\n";
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index ca98269..e3094b4 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -275,7 +275,7 @@ namespace HexagonII {
INST_ICLASS_ALU32_3 = 0xf0000000
};
- LLVM_ATTRIBUTE_UNUSED
+ [[maybe_unused]]
static unsigned getMemAccessSizeInBytes(MemAccessSize S) {
switch (S) {
case ByteAccess: return 1;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index bfea50e..6b48a21 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -422,12 +422,12 @@ static MCTargetStreamer *createHexagonNullTargetStreamer(MCStreamer &S) {
return new HexagonTargetStreamer(S);
}
-static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) {
+[[maybe_unused]] static void clearFeature(MCSubtargetInfo *STI, uint64_t F) {
if (STI->hasFeature(F))
STI->ToggleFeature(F);
}
-static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) {
+[[maybe_unused]] static bool checkFeature(MCSubtargetInfo *STI, uint64_t F) {
return STI->hasFeature(F);
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7ddf996..f7deeaf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -632,7 +632,7 @@ SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op,
case MVT::f32: {
SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i32);
if (Subtarget.is64Bit())
- NewVal = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NewVal);
+ NewVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, NewVal);
return DAG.getNode(Subtarget.is64Bit() ? LoongArchISD::MOVGR2FR_W_LA64
: LoongArchISD::MOVGR2FR_W,
DL, VT, NewVal);
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 1ce8d7e3..df0c8c1 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -264,9 +264,10 @@ public:
} // end anonymous namespace
-static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7f1ff45..2fd7327 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -3176,9 +3176,10 @@ static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
F64Regs);
}
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
- Type *OrigTy, CCState &State) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, Type *OrigTy,
+ CCState &State);
#include "MipsGenCallingConv.inc"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index bef4868..7e7ee75 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.ld is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.st is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a1fb665..2f1a7ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -233,7 +233,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
// target supports 256-bit loads/stores
if (!CanLowerTo256Bit)
return std::nullopt;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case MVT::v2i8:
case MVT::v2i64:
case MVT::v2f64:
@@ -248,7 +248,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
// global and the target supports 256-bit loads/stores.
if (!CanLowerTo256Bit)
return std::nullopt;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case MVT::v2i16: // <1 x i16x2>
case MVT::v2f16: // <1 x f16x2>
case MVT::v2bf16: // <1 x bf16x2>
@@ -270,7 +270,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
// target supports 256-bit loads/stores
if (!CanLowerTo256Bit)
return std::nullopt;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case MVT::v2f32: // <1 x f32x2>
case MVT::v4f32: // <2 x f32x2>
case MVT::v2i32: // <1 x i32x2>
@@ -749,7 +749,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTruncStoreAction(VT, MVT::i1, Expand);
}
- // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+ // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic
// expansion for these nodes when they are unaligned is incorrect if the
// type is a vector.
//
@@ -757,7 +757,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// TargetLowering::expandUnalignedLoad/Store.
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
MVT::v2i8, Expand);
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32,
+ {MVT::v2i8, MVT::v2i16}, Expand);
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
// Register custom handling for illegal type loads/stores. We'll try to custom
// lower almost all illegal types and logic in the lowering will discard cases
@@ -6749,7 +6753,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::BinOp::Xchg:
if (BitWidth == 128)
return AtomicExpansionKind::None;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case AtomicRMWInst::BinOp::And:
case AtomicRMWInst::BinOp::Or:
case AtomicRMWInst::BinOp::Xor:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6c14cf0..dfde0cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> {
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//
+// Checks PTX version and family-specific and architecture-specific SM versions.
+// For example, sm_100{f/a} and any future variants in the same family will match
+// for any PTX version greater than or equal to `PTXVersion`.
+class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Checks PTX version and architecture-specific SM versions.
+// For example, sm_100{a} will match for any PTX version
+// greater than or equal to `PTXVersion`.
+class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Helper predicate to call a subtarget method.
+class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">;
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index a8b854f..22cf3a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in {
def EXIT : NullaryInst<"exit", int_nvvm_exit>;
// Tcgen05 intrinsics
-let isConvergent = true, Predicates = [hasTcgen05Instructions] in {
-
+let isConvergent = true in {
+let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> {
def "" : BasicNVPTXInst<(outs),
(ins ADDR:$dst, B32:$ncols),
@@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">;
defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">;
defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">;
-multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
- def "" : BasicNVPTXInst<(outs),
- (ins ADDR:$tmem_addr),
- "tcgen05.shift.cta_group::" # num # ".down",
- [(Intr addr:$tmem_addr)]>;
-}
-defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
-defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
-
multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16");
defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret;
@@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in {
defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">;
defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">;
}
+} // Predicates
+
+let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in {
+multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
+ def "" : BasicNVPTXInst<(outs),
+ (ins ADDR:$tmem_addr),
+ "tcgen05.shift.cta_group::" # num # ".down",
+ [(Intr addr:$tmem_addr)]>;
+}
+defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
+defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
+} // Predicates
+
} // isConvergent
-let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in {
+let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
def tcgen05_fence_before_thread_sync: NullaryInst<
"tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>;
@@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> {
//
class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
@@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
//
class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index c548967..989be50 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const {
return TSInfo.get();
}
+bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const {
+ unsigned PTXVer = getPTXVersion();
+ if (!hasFamilySpecificFeatures() || PTXVer < PTXVersion)
+ return false;
+
+ unsigned SMVer = getSmVersion();
+ return llvm::any_of(SMVersions, [&](unsigned SM) {
+ // sm_101 is a different family, never group it with sm_10x.
+ if (SMVer == 101 || SM == 101)
+ return SMVer == SM &&
+ // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not
+ // supported.
+ !(PTXVer >= 90 && SMVer == 101);
+
+ return getSmFamilyVersion() == SM / 10 && SMVer >= SM;
+ });
+}
+
+bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const {
+ unsigned PTXVer = getPTXVersion();
+ if (!hasArchAccelFeatures() || PTXVer < PTXVersion)
+ return false;
+
+ unsigned SMVer = getSmVersion();
+ return llvm::any_of(SMVersions, [&](unsigned SM) {
+ return SMVer == SM &&
+ // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not
+ // supported.
+ !(PTXVer >= 90 && SMVer == 101);
+ });
+}
+
bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e81c56b..194dbdc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -73,6 +73,18 @@ public:
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
+ // Checks PTX version and family-specific and architecture-specific SM
+ // versions. For example, sm_100{f/a} and any future variants in the same
+ // family will match for any PTX version greater than or equal to
+ // `PTXVersion`.
+ bool hasPTXWithFamilySMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+ // Checks PTX version and architecture-specific SM versions.
+ // For example, sm_100{a} will match for any PTX version greater than or equal
+ // to `PTXVersion`.
+ bool hasPTXWithAccelSMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+
bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
@@ -127,6 +139,27 @@ public:
return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
+ // Checks following instructions support:
+ // - tcgen05.ld/st
+ // - tcgen05.alloc/dealloc/relinquish
+ // - tcgen05.cp
+ // - tcgen05.fence/wait
+ // - tcgen05.commit
+ bool hasTcgen05InstSupport() const {
+ // sm_101 renamed to sm_110 in PTX 9.0
+ return hasPTXWithFamilySMs(90, {100, 110}) ||
+ hasPTXWithFamilySMs(88, {100, 101}) ||
+ hasPTXWithAccelSMs(86, {100, 101});
+ }
+
+ // Checks tcgen05.shift instruction support.
+ bool hasTcgen05ShiftSupport() const {
+ // sm_101 renamed to sm_110 in PTX 9.0
+ return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
+ hasPTXWithAccelSMs(88, {100, 101, 103}) ||
+ hasPTXWithAccelSMs(86, {100, 101});
+ }
+
bool hasTcgen05MMAScaleInputDImm() const {
return FullSmVersion == 1003 && PTXVersion >= 86;
}
@@ -158,6 +191,7 @@ public:
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+ unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
// GPUs with "a" suffix have architecture-accelerated features that are
// supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 944a1e2..8bf0d11 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9702,6 +9702,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
return SDV;
}
+ // Recognize build vector patterns to emit VSX vector instructions
+ // instead of loading value from memory.
+ if (SDValue VecPat = combineBVLoadsSpecialValue(Op, DAG))
+ return VecPat;
}
// Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef;
@@ -15696,6 +15700,142 @@ combineElementTruncationToVectorTruncation(SDNode *N,
return SDValue();
}
+// The LXVKQ instruction loads a VSX vector with a special quadword value
+// based on an immediate value. This helper method returns the details of the
+// match as a tuple of {LXVKQ unsigned IMM Value, right_shift_amount}
+// to help generate the LXVKQ instruction and the subsequent shift instruction
+// required to match the original build vector pattern.
+
+// LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
+using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
+
+static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
+
+ // LXVKQ instruction loads the Quadword value:
+ // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
+ static const APInt BasePattern = APInt(128, 0x8000000000000000ULL) << 64;
+ static const uint32_t Uim = 16;
+
+ // Check for direct LXVKQ match (no shift needed)
+ if (FullVal == BasePattern)
+ return std::make_tuple(Uim, uint8_t{0});
+
+ // Check if FullValue is 1 (the result of the base pattern >> 127)
+ if (FullVal == APInt(128, 1))
+ return std::make_tuple(Uim, uint8_t{127});
+
+ return std::nullopt;
+}
+
+/// Combine vector loads to a single load (using lxvkq) or splat with shift of a
+/// constant (xxspltib + vsrq) by recognising patterns in the Build Vector.
+/// The LXVKQ instruction loads a VSX vector with a special quadword value based on
+/// an immediate value. If UIM=0b10000 then LXVKQ loads VSR[32×TX+T] with value
+/// 0x8000_0000_0000_0000_0000_0000_0000_0000.
+/// This can be used to inline the build vector constants that have the
+/// following patterns:
+///
+/// 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
+/// 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
+/// MSB pattern can be directly loaded using LXVKQ while LSB is loaded using a
+/// combination of splatting and right shift instructions.
+
+SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
+ "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");
+
+ // This transformation is only supported if we are loading either a byte,
+ // halfword, word, or doubleword.
+ EVT VT = Op.getValueType();
+ if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
+ VT == MVT::v2i64))
+ return SDValue();
+
+ LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
+ << VT.getEVTString() << "): ";
+ Op->dump());
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned ElemBits = VT.getScalarSizeInBits();
+
+ bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+ // Check for Non-constant operand in the build vector.
+ for (const SDValue &Operand : Op.getNode()->op_values()) {
+ if (!isa<ConstantSDNode>(Operand))
+ return SDValue();
+ }
+
+ // Assemble build vector operands as a 128-bit register value
+ // We need to reconstruct what the 128-bit register pattern would be
+ // that produces this vector when interpreted with the current endianness
+ APInt FullVal = APInt::getZero(128);
+
+ for (unsigned Index = 0; Index < NumElems; ++Index) {
+ auto *C = cast<ConstantSDNode>(Op.getOperand(Index));
+
+ // Get element value as raw bits (zero-extended)
+ uint64_t ElemValue = C->getZExtValue();
+
+ // Mask to element size to ensure we only get the relevant bits
+ if (ElemBits < 64)
+ ElemValue &= ((1ULL << ElemBits) - 1);
+
+ // Calculate bit position for this element in the 128-bit register
+ unsigned BitPos =
+ (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);
+
+ // Create APInt for the element value and shift it to correct position
+ APInt ElemAPInt(128, ElemValue);
+ ElemAPInt <<= BitPos;
+
+ // Place the element value at the correct bit position
+ FullVal |= ElemAPInt;
+ }
+
+ if (FullVal.isZero() || FullVal.isAllOnes())
+ return SDValue();
+
+ if (auto UIMOpt = getPatternInfo(FullVal)) {
+ const auto &[Uim, ShiftAmount] = *UIMOpt;
+ SDLoc Dl(Op);
+
+ // Generate LXVKQ instruction if the shift amount is zero.
+ if (ShiftAmount == 0) {
+ SDValue UimVal = DAG.getTargetConstant(Uim, Dl, MVT::i32);
+ SDValue LxvkqInstr =
+ SDValue(DAG.getMachineNode(PPC::LXVKQ, Dl, VT, UimVal), 0);
+ LLVM_DEBUG(llvm::dbgs()
+ << "combineBVLoadsSpecialValue: Instruction Emitted ";
+ LxvkqInstr.dump());
+ return LxvkqInstr;
+ }
+
+ assert(ShiftAmount == 127 && "Unexpected lxvkq shift amount value");
+
+ // The right shifted pattern can be constructed using a combination of
+ // XXSPLTIB and VSRQ instruction. VSRQ uses the shift amount from the lower
+ // 7 bits of byte 15. This can be specified using XXSPLTIB with immediate
+ // value 255.
+ SDValue ShiftAmountVec =
+ SDValue(DAG.getMachineNode(PPC::XXSPLTIB, Dl, MVT::v4i32,
+ DAG.getTargetConstant(255, Dl, MVT::i32)),
+ 0);
+ // Generate appropriate right shift instruction
+ SDValue ShiftVec = SDValue(
+ DAG.getMachineNode(PPC::VSRQ, Dl, VT, ShiftAmountVec, ShiftAmountVec),
+ 0);
+ LLVM_DEBUG(llvm::dbgs()
+ << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
+ ShiftVec.dump());
+ return ShiftVec;
+ }
+ // No patterns matched for build vectors.
+ return SDValue();
+}
+
/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 59f3387..880aca7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1472,6 +1472,9 @@ namespace llvm {
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
+ SDValue combineBVLoadsSpecialValue(SDValue Operand,
+ SelectionDAG &DAG) const;
+
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the VINSERTH instruction introduced in ISA 3.0. This is
/// essentially any shuffle of v8i16 vectors that just inserts one element
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 2384959..2d8c633 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2404,6 +2404,190 @@ multiclass XXEvalTernarySelectOr<ValueType Vt> {
126>;
}
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectNor
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : NOR(B,C)
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op in set {B, C, AND(B,C), XOR(B,C), NOT(C),
+// NOT(B), NAND(B,C)}
+// - the "false" case op is NOR(B,C)
+// =============================================================================
+multiclass XXEvalTernarySelectNor<ValueType Vt>{
+ // Pattern: (A ? AND(B,C) : NOR(B,C)) XXEVAL immediate value: 129
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), (VNor Vt:$vB, Vt:$vC)),
+ 129>;
+
+ // Pattern: (A ? B : NOR(B,C)) XXEVAL immediate value: 131
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, Vt:$vB, (VNor Vt:$vB, Vt:$vC)),131>;
+
+ // Pattern: (A ? C : NOR(B,C)) XXEVAL immediate value: 133
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, Vt:$vC, (VNor Vt:$vB, Vt:$vC)),
+ 133>;
+
+ // Pattern: (A ? XOR(B,C) : NOR(B,C)) XXEVAL immediate value: 134
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VNor Vt:$vB, Vt:$vC)),
+ 134>;
+
+ // Pattern: (A ? NOT(C) : NOR(B,C)) XXEVAL immediate value: 138
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VNor Vt:$vB, Vt:$vC)),
+ 138>;
+
+ // Pattern: (A ? NOT(B) : NOR(B,C)) XXEVAL immediate value: 140
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VNor Vt:$vB, Vt:$vC)),
+ 140>;
+
+ // Pattern: (A ? NAND(B,C) : NOR(B,C)) XXEVAL immediate value: 142
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), (VNor Vt:$vB, Vt:$vC)),
+ 142>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectEqv
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : EQV(B,C)
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op in set {OR(B,C), NOR(B,C), NAND(B,C), NOT(B),
+// NOT(C)}
+// - the "false" case op is EQV(B,C)
+// =============================================================================
+multiclass XXEvalTernarySelectEqv<ValueType Vt>{
+ // Pattern: (A ? OR(B,C) : EQV(B,C)) XXEVAL immediate value: 151
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VOr Vt:$vB, Vt:$vC), (VEqv Vt:$vB, Vt:$vC)),
+ 151>;
+
+ // Pattern: (A ? NOR(B,C) : EQV(B,C)) XXEVAL immediate value: 152
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VEqv Vt:$vB, Vt:$vC)),
+ 152>;
+
+ // Pattern: (A ? NOT(C) : EQV(B,C)) XXEVAL immediate value: 154
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VEqv Vt:$vB, Vt:$vC)),
+ 154>;
+
+ // Pattern: (A ? NAND(B,C) : EQV(B,C)) XXEVAL immediate value: 158
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), (VEqv Vt:$vB, Vt:$vC)),
+ 158>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectNotC
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : NOT(C)
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op in set {AND(B,C), OR(B,C), XOR(B,C), NAND(B,C),
+// B, NOT(B)}
+// - the "false" case op is NOT(C)
+// =============================================================================
+multiclass XXEvalTernarySelectNotC<ValueType Vt>{
+ // Pattern: (A ? AND(B,C) : NOT(C)) XXEVAL immediate value: 161
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), (VNot Vt:$vC)), 161>;
+
+ // Pattern: (A ? B : NOT(C)) XXEVAL immediate value: 163
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, Vt:$vB, (VNot Vt:$vC)), 163>;
+
+ // Pattern: (A ? XOR(B,C) : NOT(C)) XXEVAL immediate value: 166
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VNot Vt:$vC)), 166>;
+
+ // Pattern: (A ? OR(B,C) : NOT(C)) XXEVAL immediate value: 167
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VOr Vt:$vB, Vt:$vC), (VNot Vt:$vC)), 167>;
+
+ // Pattern: (A ? NOT(B) : NOT(C)) XXEVAL immediate value: 172
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VNot Vt:$vC)), 172>;
+
+ // Pattern: (A ? NAND(B,C) : NOT(C)) XXEVAL immediate value: 174
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), (VNot Vt:$vC)), 174>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectNotB
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : NOT(B)
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op in set {AND(B,C), OR(B,C), XOR(B,C), NAND(B,C),
+// C, NOT(B)}
+// - the "false" case op is NOT(B)
+// =============================================================================
+multiclass XXEvalTernarySelectNotB<ValueType Vt>{
+ // Pattern: (A ? AND(B,C) : NOT(B)) XXEVAL immediate value: 193
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VAnd Vt:$vB, Vt:$vC), (VNot Vt:$vB)), 193>;
+
+ // Pattern: (A ? C : NOT(B)) XXEVAL immediate value: 197
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, Vt:$vC, (VNot Vt:$vB)), 197>;
+
+ // Pattern: (A ? XOR(B,C) : NOT(B)) XXEVAL immediate value: 198
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VNot Vt:$vB)), 198>;
+
+ // Pattern: (A ? OR(B,C) : NOT(B)) XXEVAL immediate value: 199
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VOr Vt:$vB, Vt:$vC), (VNot Vt:$vB)), 199>;
+
+ // Pattern: (A ? NOT(C) : NOT(B)) XXEVAL immediate value: 202
+ def : XXEvalPattern<Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VNot Vt:$vB)), 202>;
+
+ // Pattern: (A ? NAND(B,C) : NOT(B)) XXEVAL immediate value: 206
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VNand Vt:$vB, Vt:$vC), (VNot Vt:$vB)), 206>;
+}
+
+// =============================================================================
+// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectNand
+// This class matches the equivalent Ternary Operation: A ? f(B,C) : NAND(B,C)
+// and emits the corresponding xxeval instruction with the imm value.
+//
+// The patterns implement xxeval vector select operations where:
+// - A is the selector vector
+// - f(B,C) is the "true" case op in set {B, C, XOR(B,C), OR(B,C), EQV(B,C)}
+// - the "false" case op is NAND(B,C)
+// =============================================================================
+multiclass XXEvalTernarySelectNand<ValueType Vt>{
+ // Pattern: (A ? B : NAND(B,C)) XXEVAL immediate value: 227
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, Vt:$vB, (VNand Vt:$vB, Vt:$vC)), 227>;
+
+ // Pattern: (A ? C : NAND(B,C)) XXEVAL immediate value: 229
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, Vt:$vC, (VNand Vt:$vB, Vt:$vC)), 229>;
+
+ // Pattern: (A ? XOR(B,C) : NAND(B,C)) XXEVAL immediate value: 230
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VNand Vt:$vB, Vt:$vC)),
+ 230>;
+
+ // Pattern: (A ? OR(B,C) : NAND(B,C)) XXEVAL immediate value: 231
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VOr Vt:$vB, Vt:$vC), (VNand Vt:$vB, Vt:$vC)),
+ 231>;
+
+ // Pattern: (A ? EQV(B,C) : NAND(B,C)) XXEVAL immediate value: 233
+ def : XXEvalPattern<
+ Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VNand Vt:$vB, Vt:$vC)),
+ 233>;
+}
+
let Predicates = [PrefixInstrs, HasP10Vector] in {
let AddedComplexity = 400 in {
def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
@@ -2519,6 +2703,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
defm : XXEvalTernarySelectC<Ty>;
defm : XXEvalTernarySelectXor<Ty>;
defm : XXEvalTernarySelectOr<Ty>;
+ defm : XXEvalTernarySelectNor<Ty>;
+ defm : XXEvalTernarySelectEqv<Ty>;
+ defm : XXEvalTernarySelectNotC<Ty>;
+ defm : XXEvalTernarySelectNotB<Ty>;
+ defm : XXEvalTernarySelectNand<Ty>;
}
// Anonymous patterns to select prefixed VSX loads and stores.
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 21dbb7c..e857b2d 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, -1, (1 << 5) - 1,
"immediate must be non-zero in the range");
+ case Match_InvalidXSfmmVType: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return generateXSfmmVTypeError(ErrorLoc);
+ }
case Match_InvalidVTypeI: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return generateVTypeError(ErrorLoc);
@@ -1688,7 +1692,7 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
(1 << 25) - 1);
// HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td.
case Match_InvalidBareSymbolQC_E_LI:
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
// END HACK
case Match_InvalidBareSImm32:
return generateImmOutOfRangeError(Operands, ErrorInfo,
@@ -3352,10 +3356,10 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
bool isValidInsnFormat(StringRef Format, const MCSubtargetInfo &STI) {
return StringSwitch<bool>(Format)
- .Cases("r", "r4", "i", "b", "sb", "u", "j", "uj", "s", true)
- .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj",
+ .Cases({"r", "r4", "i", "b", "sb", "u", "j", "uj", "s"}, true)
+ .Cases({"cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj"},
STI.hasFeature(RISCV::FeatureStdExtZca))
- .Cases("qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es",
+ .Cases({"qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es"},
!STI.hasFeature(RISCV::Feature64Bit))
.Default(false);
}
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 662d3f6..b1794b7 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -717,6 +717,18 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.clampScalar(0, sXLen, sXLen)
.lower();
+ LegalityPredicate InsertVectorEltPred = [=](const LegalityQuery &Query) {
+ LLT VecTy = Query.Types[0];
+ LLT EltTy = Query.Types[1];
+ return VecTy.getElementType() == EltTy;
+ };
+
+ getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ InsertVectorEltPred, typeIs(2, sXLen)))
+ .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST), InsertVectorEltPred,
+ typeIs(2, sXLen)));
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
index 50730c6..ab93bba 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
@@ -43,7 +43,7 @@ const llvm::StringRef RISCVLMULInstrument::DESC_NAME = "RISCV-LMUL";
bool RISCVLMULInstrument::isDataValid(llvm::StringRef Data) {
// Return true if not one of the valid LMUL strings
return StringSwitch<bool>(Data)
- .Cases("M1", "M2", "M4", "M8", "MF2", "MF4", "MF8", true)
+ .Cases({"M1", "M2", "M4", "M8", "MF2", "MF4", "MF8"}, true)
.Default(false);
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 70b7c43..e75dfe3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -142,6 +142,22 @@ enum {
ReadsPastVLShift = DestEEWShift + 2,
ReadsPastVLMask = 1ULL << ReadsPastVLShift,
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+ // 2 -> Is altfmt(BF16).
+ AltFmtTypeShift = ReadsPastVLShift + 1,
+ AltFmtTypeMask = 3ULL << AltFmtTypeShift,
+
+ // XSfmmbase
+ HasTWidenOpShift = AltFmtTypeShift + 2,
+ HasTWidenOpMask = 1ULL << HasTWidenOpShift,
+
+ HasTMOpShift = HasTWidenOpShift + 1,
+ HasTMOpMask = 1ULL << HasTMOpShift,
+
+ HasTKOpShift = HasTMOpShift + 1,
+ HasTKOpMask = 1ULL << HasTKOpShift,
};
// Helper functions to read TSFlags.
@@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) {
return TSFlags & HasRoundModeOpMask;
}
+enum class AltFmtType { DontCare, NotAltFmt, AltFmt };
+static inline AltFmtType getAltFmtType(uint64_t TSFlags) {
+ return static_cast<AltFmtType>((TSFlags & AltFmtTypeMask) >> AltFmtTypeShift);
+}
+
/// \returns true if this instruction uses vxrm
static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; }
@@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) {
return TSFlags & ReadsPastVLMask;
}
+// XSfmmbase
+static inline bool hasTWidenOp(uint64_t TSFlags) {
+ return TSFlags & HasTWidenOpMask;
+}
+
+static inline bool hasTMOp(uint64_t TSFlags) { return TSFlags & HasTMOpMask; }
+
+static inline bool hasTKOp(uint64_t TSFlags) { return TSFlags & HasTKOpMask; }
+
+static inline unsigned getTNOpNum(const MCInstrDesc &Desc) {
+ const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags));
+ unsigned Offset = 3;
+ if (hasTKOp(TSFlags))
+ Offset = 4;
+ return Desc.getNumOperands() - Offset;
+}
+
+static inline unsigned getTMOpNum(const MCInstrDesc &Desc) {
+ const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags));
+ if (hasTKOp(TSFlags))
+ return Desc.getNumOperands() - 5;
+ // vtzero.t
+ return Desc.getNumOperands() - 4;
+}
+
+static inline unsigned getTKOpNum(const MCInstrDesc &Desc) {
+ [[maybe_unused]] const uint64_t TSFlags = Desc.TSFlags;
+ assert(hasTWidenOp(TSFlags) && hasTKOp(TSFlags));
+ return Desc.getNumOperands() - 3;
+}
+
static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
// This method is only called if we expect to have a VL operand, and all
// instructions with VL also have SEW.
assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags));
+ // In XSfmmbase, TN is an alias for VL, so here we use the same TSFlags bit.
+ if (hasTWidenOp(TSFlags))
+ return getTNOpNum(Desc);
unsigned Offset = 2;
if (hasVecPolicyOp(TSFlags))
Offset = 3;
@@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
assert(hasSEWOp(TSFlags));
unsigned Offset = 1;
- if (hasVecPolicyOp(TSFlags))
+ if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags))
Offset = 2;
return Desc.getNumOperands() - Offset;
}
@@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) {
if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags))
return -1;
+ if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags))
+ return getTMOpNum(Desc) - 1;
+
// The operand order
// --------------------------------------
// | n-1 (if any) | n-2 | n-3 | n-4 |
@@ -385,7 +445,9 @@ enum OperandType : unsigned {
OPERAND_SEW_MASK,
// Vector rounding mode for VXRM or FRM.
OPERAND_VEC_RM,
- OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM,
+ // Vtype operand for XSfmm extension.
+ OPERAND_XSFMM_VTYPE,
+ OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE,
// Operand is either a register or uimm5, this is used by V extension pseudo
// instructions to represent a value that be passed as AVL to either vsetvli
// or vsetivli.
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 66ca436..de433e4 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -1100,6 +1100,12 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
--NumOps;
if (RISCVII::hasRoundModeOp(TSFlags))
--NumOps;
+ if (RISCVII::hasTWidenOp(TSFlags))
+ --NumOps;
+ if (RISCVII::hasTMOp(TSFlags))
+ --NumOps;
+ if (RISCVII::hasTKOp(TSFlags))
+ --NumOps;
bool hasVLOutput = RISCVInstrInfo::isFaultOnlyFirstLoad(*MI);
for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) {
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 5ceb477..19992e6 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -695,6 +695,9 @@ def HasStdExtZvfbfa : Predicate<"Subtarget->hasStdExtZvfbfa()">,
def FeatureStdExtZvfbfmin
: RISCVExtension<1, 0, "Vector BF16 Converts", [FeatureStdExtZve32f]>;
+def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">,
+ AssemblerPredicate<(all_of FeatureStdExtZvfbfmin),
+ "'Zvfbfmin' (Vector BF16 Converts)">;
def FeatureStdExtZvfbfwma
: RISCVExtension<1, 0, "Vector BF16 widening mul-add",
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 437022f..9a6afa1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -516,6 +516,44 @@ void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) {
CurDAG->getMachineNode(Opcode, DL, XLenVT, VLOperand, VTypeIOp));
}
+void RISCVDAGToDAGISel::selectXSfmmVSET(SDNode *Node) {
+ if (!Subtarget->hasVendorXSfmmbase())
+ return;
+
+ assert(Node->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Unexpected opcode");
+
+ SDLoc DL(Node);
+ MVT XLenVT = Subtarget->getXLenVT();
+
+ unsigned IntNo = Node->getConstantOperandVal(0);
+
+ assert((IntNo == Intrinsic::riscv_sf_vsettnt ||
+ IntNo == Intrinsic::riscv_sf_vsettm ||
+ IntNo == Intrinsic::riscv_sf_vsettk) &&
+ "Unexpected XSfmm vset intrinsic");
+
+ unsigned SEW = RISCVVType::decodeVSEW(Node->getConstantOperandVal(2));
+ unsigned Widen = RISCVVType::decodeTWiden(Node->getConstantOperandVal(3));
+ unsigned PseudoOpCode =
+ IntNo == Intrinsic::riscv_sf_vsettnt ? RISCV::PseudoSF_VSETTNT
+ : IntNo == Intrinsic::riscv_sf_vsettm ? RISCV::PseudoSF_VSETTM
+ : RISCV::PseudoSF_VSETTK;
+
+ if (IntNo == Intrinsic::riscv_sf_vsettnt) {
+ unsigned VTypeI = RISCVVType::encodeXSfmmVType(SEW, Widen, 0);
+ SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
+
+ ReplaceNode(Node, CurDAG->getMachineNode(PseudoOpCode, DL, XLenVT,
+ Node->getOperand(1), VTypeIOp));
+ } else {
+ SDValue Log2SEW = CurDAG->getTargetConstant(Log2_32(SEW), DL, XLenVT);
+ SDValue TWiden = CurDAG->getTargetConstant(Widen, DL, XLenVT);
+ ReplaceNode(Node,
+ CurDAG->getMachineNode(PseudoOpCode, DL, XLenVT,
+ Node->getOperand(1), Log2SEW, TWiden));
+ }
+}
+
bool RISCVDAGToDAGISel::tryShrinkShlLogicImm(SDNode *Node) {
MVT VT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -847,6 +885,11 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) {
return true;
}
+static Register getTileReg(uint64_t TileNum) {
+ assert(TileNum <= 15 && "Invalid tile number");
+ return RISCV::T0 + TileNum;
+}
+
void RISCVDAGToDAGISel::selectSF_VC_X_SE(SDNode *Node) {
if (!Subtarget->hasVInstructions())
return;
@@ -2035,6 +2078,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::riscv_vsetvli:
case Intrinsic::riscv_vsetvlimax:
return selectVSETVLI(Node);
+ case Intrinsic::riscv_sf_vsettnt:
+ case Intrinsic::riscv_sf_vsettm:
+ case Intrinsic::riscv_sf_vsettk:
+ return selectXSfmmVSET(Node);
}
break;
}
@@ -2458,6 +2505,142 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::riscv_sf_vc_i_se:
selectSF_VC_X_SE(Node);
return;
+ case Intrinsic::riscv_sf_vlte8:
+ case Intrinsic::riscv_sf_vlte16:
+ case Intrinsic::riscv_sf_vlte32:
+ case Intrinsic::riscv_sf_vlte64: {
+ unsigned Log2SEW;
+ unsigned PseudoInst;
+ switch (IntNo) {
+ case Intrinsic::riscv_sf_vlte8:
+ PseudoInst = RISCV::PseudoSF_VLTE8;
+ Log2SEW = 3;
+ break;
+ case Intrinsic::riscv_sf_vlte16:
+ PseudoInst = RISCV::PseudoSF_VLTE16;
+ Log2SEW = 4;
+ break;
+ case Intrinsic::riscv_sf_vlte32:
+ PseudoInst = RISCV::PseudoSF_VLTE32;
+ Log2SEW = 5;
+ break;
+ case Intrinsic::riscv_sf_vlte64:
+ PseudoInst = RISCV::PseudoSF_VLTE64;
+ Log2SEW = 6;
+ break;
+ }
+
+ SDValue SEWOp = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT);
+ SDValue TWidenOp = CurDAG->getTargetConstant(1, DL, XLenVT);
+ SDValue Operands[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Node->getOperand(4),
+ SEWOp,
+ TWidenOp,
+ Node->getOperand(0)};
+
+ MachineSDNode *TileLoad =
+ CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands);
+ if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+ CurDAG->setNodeMemRefs(TileLoad, {MemOp->getMemOperand()});
+
+ ReplaceNode(Node, TileLoad);
+ return;
+ }
+ case Intrinsic::riscv_sf_mm_s_s:
+ case Intrinsic::riscv_sf_mm_s_u:
+ case Intrinsic::riscv_sf_mm_u_s:
+ case Intrinsic::riscv_sf_mm_u_u:
+ case Intrinsic::riscv_sf_mm_e5m2_e5m2:
+ case Intrinsic::riscv_sf_mm_e5m2_e4m3:
+ case Intrinsic::riscv_sf_mm_e4m3_e5m2:
+ case Intrinsic::riscv_sf_mm_e4m3_e4m3:
+ case Intrinsic::riscv_sf_mm_f_f: {
+ bool HasFRM = false;
+ unsigned PseudoInst;
+ switch (IntNo) {
+ case Intrinsic::riscv_sf_mm_s_s:
+ PseudoInst = RISCV::PseudoSF_MM_S_S;
+ break;
+ case Intrinsic::riscv_sf_mm_s_u:
+ PseudoInst = RISCV::PseudoSF_MM_S_U;
+ break;
+ case Intrinsic::riscv_sf_mm_u_s:
+ PseudoInst = RISCV::PseudoSF_MM_U_S;
+ break;
+ case Intrinsic::riscv_sf_mm_u_u:
+ PseudoInst = RISCV::PseudoSF_MM_U_U;
+ break;
+ case Intrinsic::riscv_sf_mm_e5m2_e5m2:
+ PseudoInst = RISCV::PseudoSF_MM_E5M2_E5M2;
+ HasFRM = true;
+ break;
+ case Intrinsic::riscv_sf_mm_e5m2_e4m3:
+ PseudoInst = RISCV::PseudoSF_MM_E5M2_E4M3;
+ HasFRM = true;
+ break;
+ case Intrinsic::riscv_sf_mm_e4m3_e5m2:
+ PseudoInst = RISCV::PseudoSF_MM_E4M3_E5M2;
+ HasFRM = true;
+ break;
+ case Intrinsic::riscv_sf_mm_e4m3_e4m3:
+ PseudoInst = RISCV::PseudoSF_MM_E4M3_E4M3;
+ HasFRM = true;
+ break;
+ case Intrinsic::riscv_sf_mm_f_f:
+ if (Node->getOperand(3).getValueType().getScalarType() == MVT::bf16)
+ PseudoInst = RISCV::PseudoSF_MM_F_F_ALT;
+ else
+ PseudoInst = RISCV::PseudoSF_MM_F_F;
+ HasFRM = true;
+ break;
+ }
+ uint64_t TileNum = Node->getConstantOperandVal(2);
+ SDValue Op1 = Node->getOperand(3);
+ SDValue Op2 = Node->getOperand(4);
+ MVT VT = Op1->getSimpleValueType(0);
+ unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
+ SDValue TmOp = Node->getOperand(5);
+ SDValue TnOp = Node->getOperand(6);
+ SDValue TkOp = Node->getOperand(7);
+ SDValue TWidenOp = Node->getOperand(8);
+ SDValue Chain = Node->getOperand(0);
+
+ // sf.mm.f.f with sew=32, twiden=2 is invalid
+ if (IntNo == Intrinsic::riscv_sf_mm_f_f && Log2SEW == 5 &&
+ TWidenOp->getAsZExtVal() == 2)
+ reportFatalUsageError("sf.mm.f.f doesn't support (sew=32, twiden=2)");
+
+ SmallVector<SDValue, 10> Operands(
+ {CurDAG->getRegister(getTileReg(TileNum), XLenVT), Op1, Op2});
+ if (HasFRM)
+ Operands.push_back(
+ CurDAG->getTargetConstant(RISCVFPRndMode::DYN, DL, XLenVT));
+ Operands.append({TmOp, TnOp, TkOp,
+ CurDAG->getTargetConstant(Log2SEW, DL, XLenVT), TWidenOp,
+ Chain});
+
+ auto *NewNode =
+ CurDAG->getMachineNode(PseudoInst, DL, Node->getVTList(), Operands);
+
+ ReplaceNode(Node, NewNode);
+ return;
+ }
+ case Intrinsic::riscv_sf_vtzero_t: {
+ uint64_t TileNum = Node->getConstantOperandVal(2);
+ SDValue Tm = Node->getOperand(3);
+ SDValue Tn = Node->getOperand(4);
+ SDValue Log2SEW = Node->getOperand(5);
+ SDValue TWiden = Node->getOperand(6);
+ SDValue Chain = Node->getOperand(0);
+ auto *NewNode = CurDAG->getMachineNode(
+ RISCV::PseudoSF_VTZERO_T, DL, Node->getVTList(),
+ {CurDAG->getRegister(getTileReg(TileNum), XLenVT), Tm, Tn, Log2SEW,
+ TWiden, Chain});
+
+ ReplaceNode(Node, NewNode);
+ return;
+ }
}
break;
}
@@ -3353,14 +3536,20 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal,
0);
return true;
}
- // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the
- // LHS is equal to the RHS and non-zero otherwise.
+ // If the RHS is [-2047,2048], we can use addi/addiw with -RHS to produce 0
+ // if the LHS is equal to the RHS and non-zero otherwise.
if (isInt<12>(CVal) || CVal == 2048) {
- Val = SDValue(
- CurDAG->getMachineNode(
- RISCV::ADDI, DL, N->getValueType(0), LHS,
- CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))),
- 0);
+ unsigned Opc = RISCV::ADDI;
+ if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(LHS.getOperand(1))->getVT() == MVT::i32) {
+ Opc = RISCV::ADDIW;
+ LHS = LHS.getOperand(0);
+ }
+
+ Val = SDValue(CurDAG->getMachineNode(Opc, DL, N->getValueType(0), LHS,
+ CurDAG->getSignedTargetConstant(
+ -CVal, DL, N->getValueType(0))),
+ 0);
return true;
}
if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index f03b44c..19ee103 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -165,6 +165,7 @@ public:
void selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, bool IsOrdered);
void selectVSETVLI(SDNode *Node);
+ void selectXSfmmVSET(SDNode *Node);
void selectSF_VC_X_SE(SDNode *Node);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d..169465e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1672,6 +1672,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.useRVVForFixedLengthVectors())
setTargetDAGCombine(ISD::BITCAST);
+ setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
+
// Disable strict node mutation.
IsStrictFPEnabled = true;
EnableExtLdPromotion = true;
@@ -24828,7 +24830,8 @@ bool RISCVTargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// instruction, as it is usually smaller than the alternative sequence.
// TODO: Add vector division?
bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
- return OptSize && !VT.isVector();
+ return OptSize && !VT.isVector() &&
+ VT.getSizeInBits() <= getMaxDivRemBitWidthSupported();
}
bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const {
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index cf8d120..1b7cb9b 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -168,10 +168,13 @@ struct DemandedFields {
// If this is true, we demand that VTYPE is set to some legal state, i.e. that
// vill is unset.
bool VILL = false;
+ bool TWiden = false;
+ bool AltFmt = false;
// Return true if any part of VTYPE was used
bool usedVTYPE() const {
- return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL;
+ return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL ||
+ TWiden || AltFmt;
}
// Return true if any property of VL was used
@@ -187,6 +190,8 @@ struct DemandedFields {
TailPolicy = true;
MaskPolicy = true;
VILL = true;
+ TWiden = true;
+ AltFmt = true;
}
// Mark all VL properties as demanded
@@ -212,6 +217,8 @@ struct DemandedFields {
TailPolicy |= B.TailPolicy;
MaskPolicy |= B.MaskPolicy;
VILL |= B.VILL;
+ AltFmt |= B.AltFmt;
+ TWiden |= B.TWiden;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -258,7 +265,9 @@ struct DemandedFields {
OS << "SEWLMULRatio=" << SEWLMULRatio << ", ";
OS << "TailPolicy=" << TailPolicy << ", ";
OS << "MaskPolicy=" << MaskPolicy << ", ";
- OS << "VILL=" << VILL;
+ OS << "VILL=" << VILL << ", ";
+ OS << "AltFmt=" << AltFmt << ", ";
+ OS << "TWiden=" << TWiden;
OS << "}";
}
#endif
@@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) !=
RISCVVType::isMaskAgnostic(NewVType))
return false;
+ if (Used.TWiden && (RISCVVType::hasXSfmmWiden(CurVType) !=
+ RISCVVType::hasXSfmmWiden(NewVType) ||
+ (RISCVVType::hasXSfmmWiden(CurVType) &&
+ RISCVVType::getXSfmmWiden(CurVType) !=
+ RISCVVType::getXSfmmWiden(NewVType))))
+ return false;
+ if (Used.AltFmt &&
+ RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType))
+ return false;
return true;
}
@@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
Res.TailPolicy = false;
}
+ Res.AltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) !=
+ RISCVII::AltFmtType::DontCare;
+ Res.TWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI);
+
return Res;
}
@@ -510,6 +533,8 @@ class VSETVLIInfo {
uint8_t TailAgnostic : 1;
uint8_t MaskAgnostic : 1;
uint8_t SEWLMULRatioOnly : 1;
+ uint8_t AltFmt : 1;
+ uint8_t TWiden : 3;
public:
VSETVLIInfo()
@@ -586,6 +611,8 @@ public:
RISCVVType::VLMUL getVLMUL() const { return VLMul; }
bool getTailAgnostic() const { return TailAgnostic; }
bool getMaskAgnostic() const { return MaskAgnostic; }
+ bool getAltFmt() const { return AltFmt; }
+ unsigned getTWiden() const { return TWiden; }
bool hasNonZeroAVL(const LiveIntervals *LIS) const {
if (hasAVLImm())
@@ -647,21 +674,31 @@ public:
SEW = RISCVVType::getSEW(VType);
TailAgnostic = RISCVVType::isTailAgnostic(VType);
MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
+ AltFmt = RISCVVType::isAltFmt(VType);
+ TWiden =
+ RISCVVType::hasXSfmmWiden(VType) ? RISCVVType::getXSfmmWiden(VType) : 0;
}
- void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) {
+ void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt,
+ unsigned W) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
VLMul = L;
SEW = S;
TailAgnostic = TA;
MaskAgnostic = MA;
+ AltFmt = Altfmt;
+ TWiden = W;
}
+ void setAltFmt(bool AF) { AltFmt = AF; }
+
void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; }
unsigned encodeVTYPE() const {
assert(isValid() && !isUnknown() && !SEWLMULRatioOnly &&
"Can't encode VTYPE for uninitialized or unknown");
+ if (TWiden != 0)
+ return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt);
return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
}
@@ -674,9 +711,9 @@ public:
"Can't compare VTYPE in unknown state");
assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
"Can't compare when only LMUL/SEW ratio is valid.");
- return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) ==
+ return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) ==
std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic,
- Other.MaskAgnostic);
+ Other.MaskAgnostic, Other.AltFmt, Other.TWiden);
}
unsigned getSEWLMULRatio() const {
@@ -825,7 +862,9 @@ public:
<< "SEW=e" << (unsigned)SEW << ", "
<< "TailAgnostic=" << (bool)TailAgnostic << ", "
<< "MaskAgnostic=" << (bool)MaskAgnostic << ", "
- << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}";
+ << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", "
+ << "TWiden=" << (unsigned)TWiden << ", "
+ << "AltFmt=" << (bool)AltFmt << "}";
}
#endif
};
@@ -853,6 +892,11 @@ struct BlockData {
BlockData() = default;
};
+enum TKTMMode {
+ VSETTK = 0,
+ VSETTM = 1,
+};
+
class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
@@ -908,6 +952,7 @@ private:
VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
+ bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const;
};
} // end anonymous namespace
@@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
VSETVLIInfo NewInfo;
if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
NewInfo.setAVLImm(MI.getOperand(1).getImm());
+ } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) {
+ assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT ||
+ MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0);
+ switch (MI.getOpcode()) {
+ case RISCV::PseudoSF_VSETTNTX0:
+ NewInfo.setAVLVLMAX();
+ break;
+ case RISCV::PseudoSF_VSETTNT:
+ Register ATNReg = MI.getOperand(1).getReg();
+ NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg);
+ break;
+ }
} else {
assert(MI.getOpcode() == RISCV::PseudoVSETVLI ||
MI.getOpcode() == RISCV::PseudoVSETVLIX0);
@@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags);
+ bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt;
+ InstrInfo.setAltFmt(AltFmt);
+
unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm();
// A Log2SEW of 0 is an operation on mask registers only.
unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+ if (RISCVII::hasTWidenOp(TSFlags)) {
+ const MachineOperand &TWidenOp =
+ MI.getOperand(MI.getNumExplicitOperands() - 1);
+ unsigned TWiden = TWidenOp.getImm();
+
+ InstrInfo.setAVLVLMAX();
+ if (RISCVII::hasVLOp(TSFlags)) {
+ const MachineOperand &TNOp =
+ MI.getOperand(RISCVII::getTNOpNum(MI.getDesc()));
+
+ if (TNOp.getReg().isVirtual())
+ InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS),
+ TNOp.getReg());
+ }
+
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden);
+
+ return InstrInfo;
+ }
+
if (RISCVII::hasVLOp(TSFlags)) {
const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isImm()) {
@@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
assert(SEW == EEW && "Initial SEW doesn't match expected EEW");
}
#endif
- InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+ // TODO: Propagate the twiden from previous vtype for potential reuse.
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt,
+ /*TWiden*/ 0);
forwardVSETVLIAVL(InstrInfo);
@@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
}
void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc DL,
- const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
-
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc DL, const VSETVLIInfo &Info,
+ const VSETVLIInfo &PrevInfo) {
++NumInsertedVSETVL;
+
+ if (Info.getTWiden()) {
+ if (Info.hasAVLVLMAX()) {
+ Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass);
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE());
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MI);
+ LIS->createAndComputeVirtRegInterval(DestReg);
+ }
+ } else {
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(Info.getAVLReg())
+ .addImm(Info.encodeVTYPE());
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+ return;
+ }
+
if (PrevInfo.isValid() && !PrevInfo.isUnknown()) {
// Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
// VLMAX.
@@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
// be coalesced into another vsetvli since we won't demand any fields.
VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly
NewInfo.setAVLImm(1);
- NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true);
+ NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true,
+ /*AltFmt*/ false, /*W*/ 0);
Info = NewInfo;
return;
}
@@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
(Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() ||
IncomingInfo.getTailAgnostic(),
(Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() ||
- IncomingInfo.getMaskAgnostic());
+ IncomingInfo.getMaskAgnostic(),
+ (Demanded.AltFmt ? IncomingInfo : Info).getAltFmt(),
+ Demanded.TWiden ? IncomingInfo.getTWiden() : 0);
// If we only knew the sew/lmul ratio previously, replace the VTYPE but keep
// the AVL.
@@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB,
if (RISCVInstrInfo::isVectorConfigInstr(MI) ||
RISCVII::hasSEWOp(MI.getDesc().TSFlags) ||
- isVectorCopy(ST->getRegisterInfo(), MI))
+ isVectorCopy(ST->getRegisterInfo(), MI) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI))
HadVectorOp = true;
transferAfter(Info, MI);
@@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
};
for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+ // TODO: Support XSfmm.
+ if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) {
+ NextMI = nullptr;
+ continue;
+ }
if (!RISCVInstrInfo::isVectorConfigInstr(MI)) {
Used.doUnion(getDemanded(MI, ST));
@@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
}
}
+bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB,
+ TKTMMode Mode) const {
+
+ bool Changed = false;
+ for (auto &MI : MBB) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) ||
+ !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags))
+ continue;
+
+ VSETVLIInfo CurrInfo = computeInfoForInstr(MI);
+
+ if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags))
+ continue;
+
+ if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags))
+ continue;
+
+ unsigned OpNum = 0;
+ unsigned Opcode = 0;
+ switch (Mode) {
+ case VSETTK:
+ OpNum = RISCVII::getTKOpNum(MI.getDesc());
+ Opcode = RISCV::PseudoSF_VSETTK;
+ break;
+ case VSETTM:
+ OpNum = RISCVII::getTMOpNum(MI.getDesc());
+ Opcode = RISCV::PseudoSF_VSETTM;
+ break;
+ }
+
+ assert(OpNum && Opcode && "Invalid OpNum or Opcode");
+
+ MachineOperand &Op = MI.getOperand(OpNum);
+
+ auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(Op.getReg())
+ .addImm(Log2_32(CurrInfo.getSEW()))
+ .addImm(Log2_32(CurrInfo.getTWiden()) + 1);
+
+ Changed = true;
+ Register Reg = Op.getReg();
+ Op.setReg(Register());
+ Op.setIsKill(false);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*TmpMI);
+ LiveInterval &LI = LIS->getInterval(Reg);
+
+ // Erase the AVL operand from the instruction.
+ LIS->shrinkToUses(&LI);
+ // TODO: Enable this once needVSETVLIPHI is supported.
+ // SmallVector<LiveInterval *> SplitLIs;
+ // LIS->splitSeparateComponents(LI, SplitLIs);
+ }
+ }
+ return Changed;
+}
+
bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// Skip if the vector extension is not enabled.
ST = &MF.getSubtarget<RISCVSubtarget>();
@@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
insertReadVL(MBB);
+ for (MachineBasicBlock &MBB : MF) {
+ insertVSETMTK(MBB, VSETTM);
+ insertVSETMTK(MBB, VSETTK);
+ }
+
BlockInfo.clear();
return HaveVectorOp;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 2afd77a..fee1d15 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -177,7 +177,7 @@ def EltDepsVL : EltDeps<vl=1, mask=0>;
def EltDepsMask : EltDeps<vl=0, mask=1>;
def EltDepsVLMask : EltDeps<vl=1, mask=1>;
-class EEW <bits<2> val> {
+class EEW<bits<2> val> {
bits<2> Value = val;
}
def EEW1 : EEW<0>;
@@ -185,6 +185,13 @@ def EEWSEWx1 : EEW<1>;
def EEWSEWx2 : EEW<2>;
def EEWSEWx4 : EEW<3>;
+class AltFmtType<bits<2> val> {
+ bits<2> Value = val;
+}
+def DONT_CARE_ALTFMT : AltFmtType<0>;
+def IS_NOT_ALTFMT : AltFmtType<1>;
+def IS_ALTFMT : AltFmtType<2>;
+
class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
list<dag> pattern, InstFormat format> : Instruction {
let Namespace = "RISCV";
@@ -267,6 +274,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
// operands' VLs.
bit ReadsPastVL = 0;
let TSFlags{26} = ReadsPastVL;
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+ // 2 -> Is altfmt (BF16).
+ AltFmtType AltFmtType = DONT_CARE_ALTFMT;
+ let TSFlags{28-27} = AltFmtType.Value;
+
+ // XSfmmbase
+ bit HasTWidenOp = 0;
+ let TSFlags{29} = HasTWidenOp;
+
+ bit HasTmOp = 0;
+ let TSFlags{30} = HasTmOp;
+
+ bit HasTkOp = 0;
+ let TSFlags{31} = HasTkOp;
}
class RVInst<dag outs, dag ins, string opcodestr, string argstr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 96e1078..ddb53a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
else
Ok = RISCVFPRndMode::isValidRoundingMode(Imm);
break;
+ case RISCVOp::OPERAND_XSFMM_VTYPE:
+ Ok = RISCVVType::isValidXSfmmVType(Imm);
+ break;
}
if (!Ok) {
ErrInfo = "Invalid immediate";
@@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment(
RISCVVType::printVType(Imm, OS);
break;
}
+ case RISCVOp::OPERAND_XSFMM_VTYPE: {
+ unsigned Imm = Op.getImm();
+ RISCVVType::printXSfmmVType(Imm, OS);
+ break;
+ }
case RISCVOp::OPERAND_SEW:
case RISCVOp::OPERAND_SEW_MASK: {
unsigned Log2SEW = Op.getImm();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 298d35a..eb3c9b0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -159,7 +159,8 @@ class PseudoToVInst<string PseudoInst> {
["_M4", ""],
["_M8", ""],
["_SE", ""],
- ["_RM", ""]
+ ["_RM", ""],
+ ["_ALT", ""]
];
string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst,
!subst(AffixSubst[0], AffixSubst[1], Acc));
@@ -5861,20 +5862,6 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction,
}
}
-multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
- {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversion<intrinsic, instruction, "V",
- fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
- fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
- }
-}
-
multiclass VPatConversionVI_WF<string intrinsic, string instruction> {
foreach vtiToWti = AllWidenableIntToFloatVectors in {
defvar vti = vtiToWti.Vti;
@@ -5968,20 +5955,6 @@ multiclass VPatConversionVF_WF_RTZ<string intrinsic, string instruction,
}
}
-multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
- bit isSEWAware = 0> {
- foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
- defvar fvti = fvtiToFWti.Vti;
- defvar fwti = fvtiToFWti.Wti;
- let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
- GetVTypePredicates<fwti>.Predicates) in
- defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
- fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
- fvti.LMul, fvti.RegClass, fwti.RegClass,
- isSEWAware>;
- }
-}
-
multiclass VPatCompare_VI<string intrinsic, string inst,
ImmLeaf ImmType> {
foreach vti = AllIntegerVectors in {
@@ -6396,7 +6369,7 @@ let Defs = [VXSAT] in {
// 13. Vector Floating-Point Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
@@ -6565,7 +6538,7 @@ defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W_RM;
defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W;
} // mayRaiseFPException = true
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 14. Vector Reduction Operations
@@ -6593,7 +6566,7 @@ defm PseudoVWREDSUM : VPseudoVWRED_VS;
}
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 14.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
@@ -6612,7 +6585,7 @@ defm PseudoVFWREDUSUM : VPseudoVFWRED_VS_RM;
defm PseudoVFWREDOSUM : VPseudoVFWREDO_VS_RM;
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 15. Vector Mask Instructions
@@ -6703,7 +6676,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// 16.2. Floating-Point Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
@@ -6718,7 +6691,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.3. Vector Slide Instructions
@@ -6730,10 +6703,10 @@ let Predicates = [HasVInstructions] in {
defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX;
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">;
defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF;
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.4. Vector Register Gather Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 557d873..6a4119a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
- defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in {
+ defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
+ defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ }
}
}
foreach m = MxListW in {
@@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPListW in {
foreach m = f.MxList in
- defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in
+ defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
index a5ee701..d77a44a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
@@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in {
def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001,
"sf.vsettm", "$rd, $rs1">;
def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010,
- "sf.vsettk", "$rd, $rs1">;
+ "sf.vsettk", "$rd, $rs1">;
def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">;
def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1),
@@ -277,3 +277,195 @@ let Uses = [FRM], mayRaiseFPException = true in {
} // Predicates = [HasVendorXSfmm32a8f]
} // DecoderNamespace = "XSfvector"
+
+class VPseudoSF_VTileLoad
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileStore
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_V_T
+ : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_VTileMove_T_V
+ : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // Tn
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn,
+ AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // Tn
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+class VPseudoSF_MatMul_FRM<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm,
+ AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew,
+ ixlenimm:$twiden), []> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // Tn
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasRoundModeOp = 1;
+ let hasPostISelHook = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let Defs = [VL, VTYPE] in {
+ def PseudoSF_VSETTNT
+ : Pseudo<(outs GPR:$rd),
+ (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0
+ : Pseudo<(outs GPRNoX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0X0
+ : Pseudo<(outs GPRX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+
+let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in {
+ def PseudoSF_VSETTM
+ : Pseudo<(outs GPR:$rd),
+ (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>,
+ PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTK
+ : Pseudo<(outs GPR:$rd),
+ (ins GPR:$rs1, ixlenimm:$logwsew, ixlenimm:$twiden), []>,
+ PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+}
+
+foreach eew = [8, 16, 32, 64] in {
+ def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad;
+ def PseudoSF_VSTE # eew : VPseudoSF_VTileStore;
+}
+
+def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V;
+def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T;
+
+foreach a = I8Encodes in
+ foreach b = I8Encodes in
+ def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name)
+ : VPseudoSF_MatMul<TRM4>;
+
+let AltFmtType = IS_NOT_ALTFMT in
+ def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>;
+let AltFmtType = IS_ALTFMT in
+ def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>;
+
+foreach e1 = [5, 4] in
+ foreach e2 = [5, 4] in
+ def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2)
+ : VPseudoSF_MatMul_FRM<TRM4>;
+
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+ let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in
+ def PseudoSF_VTZERO_T
+ : RISCVVPseudo<(outs),
+ (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)>;
+ def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>;
+}
+
+class VPatXSfmmTileStore<string intrinsic_name,
+ string inst_name,
+ int log2sew> :
+ Pat<(!cast<Intrinsic>(intrinsic_name)
+ (XLenVT GPR:$rs2),
+ (XLenVT GPR:$rs1),
+ (XLenVT AVL:$tn)),
+ (!cast<Instruction>(inst_name)
+ (XLenVT GPR:$rs2),
+ (XLenVT GPR:$rs1),
+ GPR:$tn, log2sew, 1)>;
+
+class VPatXSfmmTileMove_T_V<string intrinsic_name,
+ string inst_name,
+ ValueType reg_type,
+ int log2sew> :
+ Pat<(!cast<Intrinsic>(intrinsic_name)
+ (XLenVT GPR:$rs1),
+ (reg_type VRM8:$vs2),
+ (XLenVT AVL:$atn)),
+ (!cast<Instruction>(inst_name)
+ (XLenVT GPR:$rs1),
+ (reg_type VRM8:$vs2),
+ GPR:$atn, log2sew, 1)>;
+
+class VPatXSfmmTileMove_V_T<string intrinsic_name,
+ string inst_name,
+ ValueType result_type,
+ int log2sew> :
+ Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
+ (XLenVT GPR:$rs1),
+ (XLenVT AVL:$atn))),
+ (!cast<Instruction>(inst_name)
+ (XLenVT GPR:$rs1),
+ GPR:$atn, log2sew, 1)>;
+
+class VPatXSfmmVTDiscard<string intrinsic_name,
+ string inst_name> :
+ Pat<(!cast<Intrinsic>(intrinsic_name)),
+ (!cast<Instruction>(inst_name))>;
+
+foreach eew = [8, 16, 32, 64] in
+ def : VPatXSfmmTileStore<"int_riscv_sf_vste" # eew, "PseudoSF_VSTE" # eew, !logtwo(eew)>;
+
+foreach vti = [VI8M8, VI16M8, VI32M8, VI64M8, VF16M8, VF32M8, VF64M8, VBF16M8] in {
+ def : VPatXSfmmTileMove_T_V<"int_riscv_sf_vtmv_t_v", "PseudoSF_VTMV_T_V", vti.Vector, vti.Log2SEW>;
+ def : VPatXSfmmTileMove_V_T<"int_riscv_sf_vtmv_v_t", "PseudoSF_VTMV_V_T", vti.Vector, vti.Log2SEW>;
+}
+
+def : VPatXSfmmVTDiscard<"int_riscv_sf_vtdiscard", "PseudoSF_VTDISCARD">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
index 0be9eab..c9c1246 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td
@@ -36,7 +36,7 @@ defm VFWMACCBF16_V : VWMAC_FV_V_F<"vfwmaccbf16", 0b111011>;
//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+let Predicates = [HasStdExtZvfbfmin] in {
defm PseudoVFWCVTBF16_F_F : VPseudoVWCVTD_V;
defm PseudoVFNCVTBF16_F_F : VPseudoVNCVTD_W_RM;
}
@@ -47,7 +47,31 @@ let mayRaiseFPException = true, Predicates = [HasStdExtZvfbfwma] in
//===----------------------------------------------------------------------===//
// Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
+multiclass VPatConversionWF_VF_BF<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in
+ {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversion<intrinsic, instruction, "V",
+ fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+ fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+ }
+}
+
+multiclass VPatConversionVF_WF_BF_RM<string intrinsic, string instruction,
+ bit isSEWAware = 0> {
+ foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar fwti = fvtiToFWti.Wti;
+ defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
+ fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
+ fvti.LMul, fvti.RegClass, fwti.RegClass,
+ isSEWAware>;
+ }
+}
+
+let Predicates = [HasStdExtZvfbfmin] in {
defm : VPatConversionWF_VF_BF<"int_riscv_vfwcvtbf16_f_f_v",
"PseudoVFWCVTBF16_F_F", isSEWAware=1>;
defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w",
@@ -56,7 +80,6 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fwti.Vector (any_riscv_fpextend_vl
(fvti.Vector fvti.RegClass:$rs1),
(fvti.Mask VMV0:$vm),
@@ -66,18 +89,16 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in {
(fvti.Mask VMV0:$vm),
GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
- def : Pat<(fvti.Vector (any_riscv_fpround_vl
- (fwti.Vector fwti.RegClass:$rs1),
- (fwti.Mask VMV0:$vm), VLOpFrag)),
- (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
- (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
- (fwti.Mask VMV0:$vm),
- // Value to indicate no rounding mode change in
- // RISCVInsertReadWriteCSR
- FRM_DYN,
- GPR:$vl, fvti.Log2SEW, TA_MA)>;
- let Predicates = [HasVInstructionsBF16Minimal] in
+ def : Pat<(fvti.Vector (any_riscv_fpround_vl
+ (fwti.Vector fwti.RegClass:$rs1),
+ (fwti.Mask VMV0:$vm), VLOpFrag)),
+ (!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
+ (fwti.Mask VMV0:$vm),
+ // Value to indicate no rounding mode change in
+ // RISCVInsertReadWriteCSR
+ FRM_DYN,
+ GPR:$vl, fvti.Log2SEW, TA_MA)>;
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
(fvti.Vector (IMPLICIT_DEF)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 3658817..dcae977 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -78,7 +78,41 @@ def isVectorConfigInstr
PseudoVSETVLI,
PseudoVSETVLIX0,
PseudoVSETVLIX0X0,
- PseudoVSETIVLI
+ PseudoVSETIVLI,
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if this is a PseudoSF_VSETTNT* instruction.
+def isXSfmmVectorConfigTNInstr
+ : TIIPredicate<"isXSfmmVectorConfigTNInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if this is PseudoSF_VSETTM or PseudoSF_VSETTK.
+def isXSfmmVectorConfigTMTKInstr
+ : TIIPredicate<"isXSfmmVectorConfigTMTKInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
+ ]>>>;
+
+// Returns true if this is a XSfmm vector configuration instruction.
+def isXSfmmVectorConfigInstr
+ : TIIPredicate<"isXSfmmVectorConfigInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0,
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
]>>>;
// Return true if this is 'vsetvli x0, x0, vtype' which preserves
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 40b6416..e9f43b9 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Shadow stack pointer.
markSuperRegs(Reserved, RISCV::SSP);
+ // XSfmmbase
+ for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++)
+ markSuperRegs(Reserved, Reg);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index f863392a..637d61fe 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -270,7 +270,7 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
// and floating point computation.
// The V pipeline is modeled by the VCQ, VA, VL, and VS resources. There can
// be one or two VA (Vector Arithmetic).
-multiclass SiFive7ProcResources<bit extraVALU = false> {
+multiclass SiFive7ProcResources<bit dualVALU = false> {
let BufferSize = 0 in {
def PipeA : ProcResource<1>;
def PipeB : ProcResource<1>;
@@ -279,7 +279,7 @@ multiclass SiFive7ProcResources<bit extraVALU = false> {
def FDiv : ProcResource<1>; // FP Division/Sqrt
// Arithmetic sequencer(s)
- if extraVALU then {
+ if dualVALU then {
// VA1 can handle any vector airthmetic instruction.
def VA1 : ProcResource<1>;
// VA2 generally can only handle simple vector arithmetic.
@@ -305,7 +305,7 @@ multiclass SiFive7ProcResources<bit extraVALU = false> {
def PipeAB : ProcResGroup<[!cast<ProcResource>(NAME#"PipeA"),
!cast<ProcResource>(NAME#"PipeB")]>;
- if extraVALU then
+ if dualVALU then
def VA1OrVA2 : ProcResGroup<[!cast<ProcResource>(NAME#"VA1"),
!cast<ProcResource>(NAME#"VA2")]>;
}
@@ -1550,10 +1550,10 @@ multiclass SiFive7ReadAdvance {
/// This multiclass is a "bundle" of (1) processor resources (i.e. pipes) and
/// (2) WriteRes entries. It's parameterized by config values that will
/// eventually be supplied by different SchedMachineModels.
-multiclass SiFive7SchedResources<int vlen, bit extraVALU,
+multiclass SiFive7SchedResources<int vlen, bit dualVALU,
SiFive7FPLatencies fpLatencies,
bit hasFastGather> {
- defm SiFive7 : SiFive7ProcResources<extraVALU>;
+ defm SiFive7 : SiFive7ProcResources<dualVALU>;
// Pull out defs from SiFive7ProcResources so we can refer to them by name.
defvar SiFive7PipeA = !cast<ProcResource>(NAME # SiFive7PipeA);
@@ -1562,10 +1562,10 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
defvar SiFive7IDiv = !cast<ProcResource>(NAME # SiFive7IDiv);
defvar SiFive7FDiv = !cast<ProcResource>(NAME # SiFive7FDiv);
// Pass SiFive7VA for VA1 and VA1OrVA2 if there is only 1 VALU.
- defvar SiFive7VA1 = !if (extraVALU,
+ defvar SiFive7VA1 = !if (dualVALU,
!cast<ProcResource>(NAME # SiFive7VA1),
!cast<ProcResource>(NAME # SiFive7VA));
- defvar SiFive7VA1OrVA2 = !if (extraVALU,
+ defvar SiFive7VA1OrVA2 = !if (dualVALU,
!cast<ProcResGroup>(NAME # SiFive7VA1OrVA2),
!cast<ProcResource>(NAME # SiFive7VA));
defvar SiFive7VA = !cast<ProcResource>(NAME # SiFive7VA);
@@ -1608,7 +1608,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
HasStdExtZkr];
int VLEN = vlen;
- bit HasExtraVALU = false;
+ bit HasDualVALU = false;
SiFive7FPLatencies FPLatencies;
bit HasFastGather = false;
@@ -1635,7 +1635,7 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
}
def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
- let HasExtraVALU = true;
+ let HasDualVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
let HasFastGather = true;
}
@@ -1643,7 +1643,7 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
/// Binding models to their scheduling resources.
foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
- defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
+ defm model.Name : SiFive7SchedResources<model.VLEN, model.HasDualVALU,
model.FPLatencies,
model.HasFastGather>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 96ad5c6..0a8838c 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -156,13 +156,13 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() {
return new RISCVVLOptimizer();
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
OI.print(OS);
return OS;
}
-LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]]
static raw_ostream &operator<<(raw_ostream &OS,
const std::optional<OperandInfo> &OI) {
if (OI)
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index b765fec..640b014 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -78,6 +78,8 @@ public:
void outputExecutionModeFromNumthreadsAttribute(
const MCRegister &Reg, const Attribute &Attr,
SPIRV::ExecutionMode::ExecutionMode EM);
+ void outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST);
void outputExecutionMode(const Module &M);
void outputAnnotations(const Module &M);
void outputModuleSections();
@@ -139,8 +141,8 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
// anymore.
void SPIRVAsmPrinter::cleanUp(Module &M) {
// Verifier disallows uses of intrinsic global variables.
- for (StringRef GVName : {"llvm.global_ctors", "llvm.global_dtors",
- "llvm.used", "llvm.compiler.used"}) {
+ for (StringRef GVName :
+ {"llvm.global_ctors", "llvm.global_dtors", "llvm.used"}) {
if (GlobalVariable *GV = M.getNamedGlobal(GVName))
GV->setName("");
}
@@ -495,6 +497,20 @@ void SPIRVAsmPrinter::outputExecutionModeFromNumthreadsAttribute(
outputMCInst(Inst);
}
+void SPIRVAsmPrinter::outputExecutionModeFromEnableMaximalReconvergenceAttr(
+ const MCRegister &Reg, const SPIRVSubtarget &ST) {
+ assert(ST.canUseExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence) &&
+ "Function called when SPV_KHR_maximal_reconvergence is not enabled.");
+
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ unsigned EM =
+ static_cast<unsigned>(SPIRV::ExecutionMode::MaximallyReconvergesKHR);
+ Inst.addOperand(MCOperand::createImm(EM));
+ outputMCInst(Inst);
+}
+
void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode");
if (Node) {
@@ -551,6 +567,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
if (Attribute Attr = F.getFnAttribute("hlsl.numthreads"); Attr.isValid())
outputExecutionModeFromNumthreadsAttribute(
FReg, Attr, SPIRV::ExecutionMode::LocalSize);
+ if (Attribute Attr = F.getFnAttribute("enable-maximal-reconvergence");
+ Attr.getValueAsBool()) {
+ outputExecutionModeFromEnableMaximalReconvergenceAttr(FReg, *ST);
+ }
if (MDNode *Node = F.getMetadata("work_group_size_hint"))
outputExecutionModeFromMDNode(FReg, Node,
SPIRV::ExecutionMode::LocalSizeHint, 3, 1);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 5f3ed86..96f5dee 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -153,7 +153,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::
SPV_EXT_relaxed_printf_string_address_space},
{"SPV_INTEL_predicated_io",
- SPIRV::Extension::Extension::SPV_INTEL_predicated_io}};
+ SPIRV::Extension::Extension::SPV_INTEL_predicated_io},
+ {"SPV_KHR_maximal_reconvergence",
+ SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index e16c8f0..a151fd2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -17,6 +17,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
@@ -1391,19 +1392,19 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
Constant *AggrConst = nullptr;
Type *ResTy = nullptr;
if (auto *COp = dyn_cast<ConstantVector>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = COp->getType();
} else if (auto *COp = dyn_cast<ConstantArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantStruct>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantDataArray>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = B.getInt32Ty();
} else if (auto *COp = dyn_cast<ConstantAggregateZero>(Op)) {
- AggrConst = cast<Constant>(COp);
+ AggrConst = COp;
ResTy = Op->getType()->isVectorTy() ? COp->getType() : B.getInt32Ty();
}
if (AggrConst) {
@@ -2028,9 +2029,13 @@ Instruction *SPIRVEmitIntrinsics::visitUnreachableInst(UnreachableInst &I) {
void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
IRBuilder<> &B) {
- // Skip special artifical variable llvm.global.annotations.
- if (GV.getName() == "llvm.global.annotations")
+ // Skip special artificial variables.
+ static const StringSet<> ArtificialGlobals{"llvm.global.annotations",
+ "llvm.compiler.used"};
+
+ if (ArtificialGlobals.contains(GV.getName()))
return;
+
Constant *Init = nullptr;
if (hasInitializer(&GV)) {
// Deduce element type and store results in Global Registry.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index a466ab2..a0cff4d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3765,7 +3765,6 @@ void SPIRVInstructionSelector::decorateUsesAsNonUniform(
SPIRV::Decoration::NonUniformEXT, {});
}
}
- return;
}
bool SPIRVInstructionSelector::extractSubvector(
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 5144fb1..61a0bbe 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1200,6 +1200,23 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
+ bool IsNonUniform =
+ hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
+
+ auto FirstIndexReg = Instr.getOperand(3).getReg();
+ bool FirstIndexIsConstant =
+ Subtarget.getInstrInfo()->isConstantInstr(*MRI.getVRegDef(FirstIndexReg));
+
+ if (StorageClass == SPIRV::StorageClass::StorageClass::StorageBuffer) {
+ if (IsNonUniform)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayNonUniformIndexingEXT);
+ else if (!FirstIndexIsConstant)
+ Handler.addRequirements(
+ SPIRV::Capability::StorageBufferArrayDynamicIndexing);
+ return;
+ }
+
Register PointeeTypeReg = ResTypeInst->getOperand(2).getReg();
MachineInstr *PointeeType = MRI.getUniqueVRegDef(PointeeTypeReg);
if (PointeeType->getOpcode() != SPIRV::OpTypeImage &&
@@ -1208,27 +1225,25 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
return;
}
- bool IsNonUniform =
- hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI);
if (isUniformTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::UniformTexelBufferArrayDynamicIndexingEXT);
} else if (isInputAttachment(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::InputAttachmentArrayDynamicIndexingEXT);
} else if (isStorageTexelBuffer(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageTexelBufferArrayDynamicIndexingEXT);
} else if (isSampledImage(PointeeType) ||
@@ -1237,14 +1252,14 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::SampledImageArrayDynamicIndexing);
} else if (isStorageImage(PointeeType)) {
if (IsNonUniform)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayNonUniformIndexingEXT);
- else
+ else if (!FirstIndexIsConstant)
Handler.addRequirements(
SPIRV::Capability::StorageImageArrayDynamicIndexing);
}
@@ -2155,6 +2170,9 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
SPIRV::OperandCategory::ExecutionModeOperand,
SPIRV::ExecutionMode::LocalSize, ST);
}
+ if (F.getFnAttribute("enable-maximal-reconvergence").getValueAsBool()) {
+ MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_maximal_reconvergence);
+ }
if (F.getMetadata("work_group_size_hint"))
MAI.Reqs.getAndAddRequirements(
SPIRV::OperandCategory::ExecutionModeOperand,
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 2625642..7d08b29 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -386,6 +386,7 @@ defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>;
defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>;
+defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -698,7 +699,7 @@ defm IntersectionNV: ExecutionModelOperand<5314, [RayTracingNV]>;
defm AnyHitNV: ExecutionModelOperand<5315, [RayTracingNV]>;
defm ClosestHitNV: ExecutionModelOperand<5316, [RayTracingNV]>;
defm MissNV: ExecutionModelOperand<5317, [RayTracingNV]>;
-defm CallableNV: ExecutionModelOperand<5318, [RayTracingNV]>;
+defm CallableNV : ExecutionModelOperand<5318, [RayTracingNV]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define MemoryModel enum values and at the same time
@@ -805,6 +806,7 @@ defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>;
defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>;
defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>;
defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>;
+defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>;
//===----------------------------------------------------------------------===//
// Multiclass used to define StorageClass enum values and at the same time
diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
index 2934c88..fa08d44 100644
--- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -246,8 +246,7 @@ SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
}
}
-static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
-{
+[[maybe_unused]] static bool verifyLeafProcRegUse(MachineRegisterInfo *MRI) {
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
if (MRI->isPhysRegUsed(reg))
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index a0cf881..5a06ea3 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -24,6 +24,7 @@ class SystemZTargetMachine;
namespace SystemZ {
// Condition-code mask values.
+const unsigned CCMASK_NONE = 0;
const unsigned CCMASK_0 = 1 << 3;
const unsigned CCMASK_1 = 1 << 2;
const unsigned CCMASK_2 = 1 << 1;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3b7d11a..de28faf 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -15,6 +15,7 @@
#include "SystemZConstantPoolValue.h"
#include "SystemZMachineFunctionInfo.h"
#include "SystemZTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -24,6 +25,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsS390.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -1514,6 +1516,9 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
}
+ } else if (Constraint.size() == 5 && Constraint.starts_with("{")) {
+ if (StringRef("{@cc}").compare(Constraint) == 0)
+ return C_Other;
}
return TargetLowering::getConstraintType(Constraint);
}
@@ -1707,6 +1712,10 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
SystemZMC::VR128Regs, 32);
}
+ if (Constraint[1] == '@') {
+ if (StringRef("{@cc}").compare(Constraint) == 0)
+ return std::make_pair(0u, &SystemZ::GR32BitRegClass);
+ }
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
@@ -1737,6 +1746,38 @@ Register SystemZTargetLowering::getExceptionSelectorRegister(
return Subtarget.isTargetXPLINK64() ? SystemZ::R2D : SystemZ::R7D;
}
+// Convert condition code in CCReg to an i32 value.
+static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
+ SDLoc DL(CCReg);
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+ DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
+}
+
+// Lower @cc targets via setcc.
+SDValue SystemZTargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Glue, const SDLoc &DL,
+ const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
+ if (StringRef("{@cc}").compare(OpInfo.ConstraintCode) != 0)
+ return SDValue();
+
+ // Check that return type is valid.
+ if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
+ OpInfo.ConstraintVT.getSizeInBits() < 8)
+ report_fatal_error("Glue output operand is of invalid type");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(SystemZ::CC);
+
+ if (Glue.getNode()) {
+ Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32, Glue);
+ Chain = Glue.getValue(1);
+ } else
+ Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32);
+ return getCCResult(DAG, Glue);
+}
+
void SystemZTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
@@ -5300,14 +5341,6 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
Node->getMemoryVT(), Node->getMemOperand());
}
-// Convert condition code in CCReg to an i32 value.
-static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
- SDLoc DL(CCReg);
- SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
- return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
- DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
-}
-
SDValue
SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
@@ -8723,95 +8756,247 @@ SDValue SystemZTargetLowering::combineSETCC(
return SDValue();
}
-static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
+static std::pair<SDValue, int> findCCUse(const SDValue &Val) {
+ switch (Val.getOpcode()) {
+ default:
+ return std::make_pair(SDValue(), SystemZ::CCMASK_NONE);
+ case SystemZISD::IPM:
+ if (Val.getOperand(0).getOpcode() == SystemZISD::CLC ||
+ Val.getOperand(0).getOpcode() == SystemZISD::STRCMP)
+ return std::make_pair(Val.getOperand(0), SystemZ::CCMASK_ICMP);
+ return std::make_pair(Val.getOperand(0), SystemZ::CCMASK_ANY);
+ case SystemZISD::SELECT_CCMASK: {
+ SDValue Op4CCReg = Val.getOperand(4);
+ if (Op4CCReg.getOpcode() == SystemZISD::ICMP ||
+ Op4CCReg.getOpcode() == SystemZISD::TM) {
+ auto [OpCC, OpCCValid] = findCCUse(Op4CCReg.getOperand(0));
+ if (OpCC != SDValue())
+ return std::make_pair(OpCC, OpCCValid);
+ }
+ auto *CCValid = dyn_cast<ConstantSDNode>(Val.getOperand(2));
+ if (!CCValid)
+ return std::make_pair(SDValue(), SystemZ::CCMASK_NONE);
+ int CCValidVal = CCValid->getZExtValue();
+ return std::make_pair(Op4CCReg, CCValidVal);
+ }
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ auto [Op0CC, Op0CCValid] = findCCUse(Val.getOperand(0));
+ if (Op0CC != SDValue())
+ return std::make_pair(Op0CC, Op0CCValid);
+ return findCCUse(Val.getOperand(1));
+ }
+}
+
+static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask,
+ SelectionDAG &DAG);
+
+SmallVector<SDValue, 4> static simplifyAssumingCCVal(SDValue &Val, SDValue &CC,
+ SelectionDAG &DAG) {
+ SDLoc DL(Val);
+ auto Opcode = Val.getOpcode();
+ switch (Opcode) {
+ default:
+ return {};
+ case ISD::Constant:
+ return {Val, Val, Val, Val};
+ case SystemZISD::IPM: {
+ SDValue IPMOp0 = Val.getOperand(0);
+ if (IPMOp0 != CC)
+ return {};
+ SmallVector<SDValue, 4> ShiftedCCVals;
+ for (auto CC : {0, 1, 2, 3})
+ ShiftedCCVals.emplace_back(
+ DAG.getConstant((CC << SystemZ::IPM_CC), DL, MVT::i32));
+ return ShiftedCCVals;
+ }
+ case SystemZISD::SELECT_CCMASK: {
+ SDValue TrueVal = Val.getOperand(0), FalseVal = Val.getOperand(1);
+ auto *CCValid = dyn_cast<ConstantSDNode>(Val.getOperand(2));
+ auto *CCMask = dyn_cast<ConstantSDNode>(Val.getOperand(3));
+ if (!CCValid || !CCMask)
+ return {};
+
+ int CCValidVal = CCValid->getZExtValue();
+ int CCMaskVal = CCMask->getZExtValue();
+ const auto &&TrueSDVals = simplifyAssumingCCVal(TrueVal, CC, DAG);
+ const auto &&FalseSDVals = simplifyAssumingCCVal(FalseVal, CC, DAG);
+ if (TrueSDVals.empty() || FalseSDVals.empty())
+ return {};
+ SDValue Op4CCReg = Val.getOperand(4);
+ if (Op4CCReg != CC)
+ combineCCMask(Op4CCReg, CCValidVal, CCMaskVal, DAG);
+ if (Op4CCReg != CC)
+ return {};
+ SmallVector<SDValue, 4> MergedSDVals;
+ for (auto &CCVal : {0, 1, 2, 3})
+ MergedSDVals.emplace_back(((CCMaskVal & (1 << (3 - CCVal))) != 0)
+ ? TrueSDVals[CCVal]
+ : FalseSDVals[CCVal]);
+ return MergedSDVals;
+ }
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SRA:
+ // Avoid introducing CC spills (because ADD/AND/OR/XOR/SRA
+ // would clobber CC).
+ if (!Val.hasOneUse())
+ return {};
+ [[fallthrough]];
+ case ISD::SHL:
+ case ISD::SRL:
+ SDValue Op0 = Val.getOperand(0), Op1 = Val.getOperand(1);
+ const auto &&Op0SDVals = simplifyAssumingCCVal(Op0, CC, DAG);
+ const auto &&Op1SDVals = simplifyAssumingCCVal(Op1, CC, DAG);
+ if (Op0SDVals.empty() || Op1SDVals.empty())
+ return {};
+ SmallVector<SDValue, 4> BinaryOpSDVals;
+ for (auto CCVal : {0, 1, 2, 3})
+ BinaryOpSDVals.emplace_back(DAG.getNode(
+ Opcode, DL, Val.getValueType(), Op0SDVals[CCVal], Op1SDVals[CCVal]));
+ return BinaryOpSDVals;
+ }
+}
+
+static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask,
+ SelectionDAG &DAG) {
// We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
// set by the CCReg instruction using the CCValid / CCMask masks,
- // If the CCReg instruction is itself a ICMP testing the condition
+ // If the CCReg instruction is itself an ICMP / TM testing the condition
// code set by some other instruction, see whether we can directly
// use that condition code.
-
- // Verify that we have an ICMP against some constant.
- if (CCValid != SystemZ::CCMASK_ICMP)
- return false;
- auto *ICmp = CCReg.getNode();
- if (ICmp->getOpcode() != SystemZISD::ICMP)
- return false;
- auto *CompareLHS = ICmp->getOperand(0).getNode();
- auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
- if (!CompareRHS)
+ auto *CCNode = CCReg.getNode();
+ if (!CCNode)
return false;
- // Optimize the case where CompareLHS is a SELECT_CCMASK.
- if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
- // Verify that we have an appropriate mask for a EQ or NE comparison.
- bool Invert = false;
- if (CCMask == SystemZ::CCMASK_CMP_NE)
- Invert = !Invert;
- else if (CCMask != SystemZ::CCMASK_CMP_EQ)
+ if (CCNode->getOpcode() == SystemZISD::TM) {
+ if (CCValid != SystemZ::CCMASK_TM)
return false;
-
- // Verify that the ICMP compares against one of select values.
- auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
- if (!TrueVal)
- return false;
- auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
- if (!FalseVal)
+ auto emulateTMCCMask = [](const SDValue &Op0Val, const SDValue &Op1Val) {
+ auto *Op0Node = dyn_cast<ConstantSDNode>(Op0Val.getNode());
+ auto *Op1Node = dyn_cast<ConstantSDNode>(Op1Val.getNode());
+ if (!Op0Node || !Op1Node)
+ return -1;
+ auto Op0APVal = Op0Node->getAPIntValue();
+ auto Op1APVal = Op1Node->getAPIntValue();
+ auto Result = Op0APVal & Op1APVal;
+ bool AllOnes = Result == Op1APVal;
+ bool AllZeros = Result == 0;
+ bool IsLeftMostBitSet = Result[Op1APVal.getActiveBits()] != 0;
+ return AllZeros ? 0 : AllOnes ? 3 : IsLeftMostBitSet ? 2 : 1;
+ };
+ SDValue Op0 = CCNode->getOperand(0);
+ SDValue Op1 = CCNode->getOperand(1);
+ auto [Op0CC, Op0CCValid] = findCCUse(Op0);
+ if (Op0CC == SDValue())
return false;
- if (CompareRHS->getAPIntValue() == FalseVal->getAPIntValue())
- Invert = !Invert;
- else if (CompareRHS->getAPIntValue() != TrueVal->getAPIntValue())
+ const auto &&Op0SDVals = simplifyAssumingCCVal(Op0, Op0CC, DAG);
+ const auto &&Op1SDVals = simplifyAssumingCCVal(Op1, Op0CC, DAG);
+ if (Op0SDVals.empty() || Op1SDVals.empty())
return false;
-
- // Compute the effective CC mask for the new branch or select.
- auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
- auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
- if (!NewCCValid || !NewCCMask)
- return false;
- CCValid = NewCCValid->getZExtValue();
- CCMask = NewCCMask->getZExtValue();
- if (Invert)
- CCMask ^= CCValid;
-
- // Return the updated CCReg link.
- CCReg = CompareLHS->getOperand(4);
+ int NewCCMask = 0;
+ for (auto CC : {0, 1, 2, 3}) {
+ auto CCVal = emulateTMCCMask(Op0SDVals[CC], Op1SDVals[CC]);
+ if (CCVal < 0)
+ return false;
+ NewCCMask <<= 1;
+ NewCCMask |= (CCMask & (1 << (3 - CCVal))) != 0;
+ }
+ NewCCMask &= Op0CCValid;
+ CCReg = Op0CC;
+ CCMask = NewCCMask;
+ CCValid = Op0CCValid;
return true;
}
+ if (CCNode->getOpcode() != SystemZISD::ICMP ||
+ CCValid != SystemZ::CCMASK_ICMP)
+ return false;
- // Optimize the case where CompareRHS is (SRA (SHL (IPM))).
- if (CompareLHS->getOpcode() == ISD::SRA) {
- auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
- if (!SRACount || SRACount->getZExtValue() != 30)
- return false;
- auto *SHL = CompareLHS->getOperand(0).getNode();
- if (SHL->getOpcode() != ISD::SHL)
- return false;
- auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
- if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
- return false;
- auto *IPM = SHL->getOperand(0).getNode();
- if (IPM->getOpcode() != SystemZISD::IPM)
- return false;
-
- // Avoid introducing CC spills (because SRA would clobber CC).
- if (!CompareLHS->hasOneUse())
- return false;
- // Verify that the ICMP compares against zero.
- if (CompareRHS->getZExtValue() != 0)
+ SDValue CmpOp0 = CCNode->getOperand(0);
+ SDValue CmpOp1 = CCNode->getOperand(1);
+ SDValue CmpOp2 = CCNode->getOperand(2);
+ auto [Op0CC, Op0CCValid] = findCCUse(CmpOp0);
+ if (Op0CC != SDValue()) {
+ const auto &&Op0SDVals = simplifyAssumingCCVal(CmpOp0, Op0CC, DAG);
+ const auto &&Op1SDVals = simplifyAssumingCCVal(CmpOp1, Op0CC, DAG);
+ if (Op0SDVals.empty() || Op1SDVals.empty())
return false;
- // Compute the effective CC mask for the new branch or select.
- CCMask = SystemZ::reverseCCMask(CCMask);
-
- // Return the updated CCReg link.
- CCReg = IPM->getOperand(0);
+ auto *CmpType = dyn_cast<ConstantSDNode>(CmpOp2);
+ auto CmpTypeVal = CmpType->getZExtValue();
+ const auto compareCCSigned = [&CmpTypeVal](const SDValue &Op0Val,
+ const SDValue &Op1Val) {
+ auto *Op0Node = dyn_cast<ConstantSDNode>(Op0Val.getNode());
+ auto *Op1Node = dyn_cast<ConstantSDNode>(Op1Val.getNode());
+ if (!Op0Node || !Op1Node)
+ return -1;
+ auto Op0APVal = Op0Node->getAPIntValue();
+ auto Op1APVal = Op1Node->getAPIntValue();
+ if (CmpTypeVal == SystemZICMP::SignedOnly)
+ return Op0APVal == Op1APVal ? 0 : Op0APVal.slt(Op1APVal) ? 1 : 2;
+ return Op0APVal == Op1APVal ? 0 : Op0APVal.ult(Op1APVal) ? 1 : 2;
+ };
+ int NewCCMask = 0;
+ for (auto CC : {0, 1, 2, 3}) {
+ auto CCVal = compareCCSigned(Op0SDVals[CC], Op1SDVals[CC]);
+ if (CCVal < 0)
+ return false;
+ NewCCMask <<= 1;
+ NewCCMask |= (CCMask & (1 << (3 - CCVal))) != 0;
+ }
+ NewCCMask &= Op0CCValid;
+ CCMask = NewCCMask;
+ CCReg = Op0CC;
+ CCValid = Op0CCValid;
return true;
}
return false;
}
-SDValue SystemZTargetLowering::combineBR_CCMASK(
- SDNode *N, DAGCombinerInfo &DCI) const {
+// Cost model for merging conditionals versus splitting into multiple branches.
+TargetLoweringBase::CondMergingParams
+SystemZTargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
+ const Value *Lhs,
+ const Value *Rhs) const {
+ const auto isFlagOutOpCC = [](const Value *V) {
+ using namespace llvm::PatternMatch;
+ const Value *RHSVal;
+ const APInt *RHSC;
+ if (const auto *I = dyn_cast<Instruction>(V)) {
+ // PatternMatch.h provides concise tree-based pattern matching of LLVM IR.
+ if (match(I->getOperand(0), m_And(m_Value(RHSVal), m_APInt(RHSC))) ||
+ match(I, m_Cmp(m_Value(RHSVal), m_APInt(RHSC)))) {
+ if (const auto *CB = dyn_cast<CallBase>(RHSVal)) {
+ if (CB->isInlineAsm()) {
+ const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+ return IA &&
+ IA->getConstraintString().find("{@cc}") != std::string::npos;
+ }
+ }
+ }
+ }
+ return false;
+ };
+ // Pattern (ICmp %asm) or (ICmp (And %asm)).
+ // The cost of the longest dependency chain (ICmp, And) is 2, so
+ // CostThreshold or BaseCost can be set >= 2. If the cost of an instruction
+ // is <= CostThreshold, the conditionals are merged; otherwise they are split.
+ if (isFlagOutOpCC(Lhs) && isFlagOutOpCC(Rhs))
+ return {3, 0, -1};
+ // Default.
+ return {-1, -1, -1};
+}
+
+SDValue SystemZTargetLowering::combineBR_CCMASK(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
// Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
@@ -8824,8 +9009,7 @@ SDValue SystemZTargetLowering::combineBR_CCMASK(
int CCMaskVal = CCMask->getZExtValue();
SDValue Chain = N->getOperand(0);
SDValue CCReg = N->getOperand(4);
-
- if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
+ if (combineCCMask(CCReg, CCValidVal, CCMaskVal, DAG))
return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
Chain,
DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
@@ -8848,16 +9032,80 @@ SDValue SystemZTargetLowering::combineSELECT_CCMASK(
int CCMaskVal = CCMask->getZExtValue();
SDValue CCReg = N->getOperand(4);
- if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
- return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1),
- DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
- DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
- CCReg);
+ bool IsCombinedCCReg = combineCCMask(CCReg, CCValidVal, CCMaskVal, DAG);
+
+ // Populate SDVals vector for each condition code ccval for given Val, which
+ // can again be another nested select_ccmask with the same CC.
+ const auto constructCCSDValsFromSELECT = [&CCReg](SDValue &Val) {
+ if (Val.getOpcode() == SystemZISD::SELECT_CCMASK) {
+ SmallVector<SDValue, 4> Res;
+ if (Val.getOperand(4) != CCReg)
+ return SmallVector<SDValue, 4>{};
+ SDValue TrueVal = Val.getOperand(0), FalseVal = Val.getOperand(1);
+ auto *CCMask = dyn_cast<ConstantSDNode>(Val.getOperand(3));
+ if (!CCMask)
+ return SmallVector<SDValue, 4>{};
+
+ int CCMaskVal = CCMask->getZExtValue();
+ for (auto &CC : {0, 1, 2, 3})
+ Res.emplace_back(((CCMaskVal & (1 << (3 - CC))) != 0) ? TrueVal
+ : FalseVal);
+ return Res;
+ }
+ return SmallVector<SDValue, 4>{Val, Val, Val, Val};
+ };
+ // Attempting to optimize TrueVal/FalseVal in outermost select_ccmask either
+ // with CCReg found by combineCCMask or original CCReg.
+ SDValue TrueVal = N->getOperand(0);
+ SDValue FalseVal = N->getOperand(1);
+ auto &&TrueSDVals = simplifyAssumingCCVal(TrueVal, CCReg, DAG);
+ auto &&FalseSDVals = simplifyAssumingCCVal(FalseVal, CCReg, DAG);
+ // TrueSDVals/FalseSDVals might be empty in case of non-constant
+ // TrueVal/FalseVal for select_ccmask, which cannot be optimized further.
+ if (TrueSDVals.empty())
+ TrueSDVals = constructCCSDValsFromSELECT(TrueVal);
+ if (FalseSDVals.empty())
+ FalseSDVals = constructCCSDValsFromSELECT(FalseVal);
+ if (!TrueSDVals.empty() && !FalseSDVals.empty()) {
+ SmallSet<SDValue, 4> MergedSDValsSet;
+ // Ignore CC values outside CCValid.
+ for (auto CC : {0, 1, 2, 3}) {
+ if ((CCValidVal & ((1 << (3 - CC)))) != 0)
+ MergedSDValsSet.insert(((CCMaskVal & (1 << (3 - CC))) != 0)
+ ? TrueSDVals[CC]
+ : FalseSDVals[CC]);
+ }
+ if (MergedSDValsSet.size() == 1)
+ return *MergedSDValsSet.begin();
+ if (MergedSDValsSet.size() == 2) {
+ auto BeginIt = MergedSDValsSet.begin();
+ SDValue NewTrueVal = *BeginIt, NewFalseVal = *next(BeginIt);
+ if (NewTrueVal == FalseVal || NewFalseVal == TrueVal)
+ std::swap(NewTrueVal, NewFalseVal);
+ int NewCCMask = 0;
+ for (auto CC : {0, 1, 2, 3}) {
+ NewCCMask <<= 1;
+ NewCCMask |= ((CCMaskVal & (1 << (3 - CC))) != 0)
+ ? (TrueSDVals[CC] == NewTrueVal)
+ : (FalseSDVals[CC] == NewTrueVal);
+ }
+ CCMaskVal = NewCCMask;
+ CCMaskVal &= CCValidVal;
+ TrueVal = NewTrueVal;
+ FalseVal = NewFalseVal;
+ IsCombinedCCReg = true;
+ }
+ }
+
+ if (IsCombinedCCReg)
+ return DAG.getNode(
+ SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0), TrueVal,
+ FalseVal, DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
+ DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32), CCReg);
+
return SDValue();
}
-
SDValue SystemZTargetLowering::combineGET_CCMASK(
SDNode *N, DAGCombinerInfo &DCI) const {
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f8706b7..d5b7603 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -533,6 +533,18 @@ public:
}
const char *getTargetNodeName(unsigned Opcode) const override;
+
+ // This function currently returns cost for srl/ipm/cc sequence for merging.
+ CondMergingParams
+ getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
+ const Value *Rhs) const override;
+
+ // Handle lowering of flag output operands in inline assembly.
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+ const SDLoc &DL,
+ const AsmOperandInfo &Constraint,
+ SelectionDAG &DAG) const override;
+
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
index d9c8e22..6e99fc3 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
@@ -23,7 +23,7 @@ std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
.Case("i64", wasm::ValType::I64)
.Case("f32", wasm::ValType::F32)
.Case("f64", wasm::ValType::F64)
- .Cases("v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2",
+ .Cases({"v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2"},
wasm::ValType::V128)
.Case("funcref", wasm::ValType::FUNCREF)
.Case("externref", wasm::ValType::EXTERNREF)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6472334..f973949 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
}
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+ }
+
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(ISD::FMULADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMULADD, MVT::v2f64, Legal);
+ }
+
// Partial MLA reductions.
for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) {
setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal);
@@ -592,6 +601,29 @@ static MachineBasicBlock *LowerMemcpy(MachineInstr &MI, DebugLoc DL,
MachineOperand Src = MI.getOperand(3);
MachineOperand Len = MI.getOperand(4);
+ // If the length is a constant, we don't actually need the check.
+ if (MachineInstr *Def = MRI.getVRegDef(Len.getReg())) {
+ if (Def->getOpcode() == WebAssembly::CONST_I32 ||
+ Def->getOpcode() == WebAssembly::CONST_I64) {
+ if (Def->getOperand(1).getImm() == 0) {
+ // A zero-length memcpy is a no-op.
+ MI.eraseFromParent();
+ return BB;
+ }
+ // A non-zero-length memcpy doesn't need a zero check.
+ unsigned MemoryCopy =
+ Int64 ? WebAssembly::MEMORY_COPY_A64 : WebAssembly::MEMORY_COPY_A32;
+ BuildMI(*BB, MI, DL, TII.get(MemoryCopy))
+ .add(DstMem)
+ .add(SrcMem)
+ .add(Dst)
+ .add(Src)
+ .add(Len);
+ MI.eraseFromParent();
+ return BB;
+ }
+ }
+
// We're going to add an extra use to `Len` to test if it's zero; that
// use shouldn't be a kill, even if the original use is.
MachineOperand NoKillLen = Len;
@@ -660,6 +692,28 @@ static MachineBasicBlock *LowerMemset(MachineInstr &MI, DebugLoc DL,
MachineOperand Val = MI.getOperand(2);
MachineOperand Len = MI.getOperand(3);
+ // If the length is a constant, we don't actually need the check.
+ if (MachineInstr *Def = MRI.getVRegDef(Len.getReg())) {
+ if (Def->getOpcode() == WebAssembly::CONST_I32 ||
+ Def->getOpcode() == WebAssembly::CONST_I64) {
+ if (Def->getOperand(1).getImm() == 0) {
+ // A zero-length memset is a no-op.
+ MI.eraseFromParent();
+ return BB;
+ }
+ // A non-zero-length memset doesn't need a zero check.
+ unsigned MemoryFill =
+ Int64 ? WebAssembly::MEMORY_FILL_A64 : WebAssembly::MEMORY_FILL_A32;
+ BuildMI(*BB, MI, DL, TII.get(MemoryFill))
+ .add(Mem)
+ .add(Dst)
+ .add(Val)
+ .add(Len);
+ MI.eraseFromParent();
+ return BB;
+ }
+ }
+
// We're going to add an extra use to `Len` to test if it's zero; that
// use shouldn't be a kill, even if the original use is.
MachineOperand NoKillLen = Len;
@@ -1120,6 +1174,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+ const MachineFunction &MF, EVT VT) const {
+ if (!Subtarget->hasFP16() || !VT.isVector())
+ return false;
+
+ EVT ScalarVT = VT.getScalarType();
+ if (!ScalarVT.isSimple())
+ return false;
+
+ return ScalarVT.getSimpleVT().SimpleTy == MVT::f16;
+}
+
bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts(
SDValue Op, const TargetLoweringOpt &TLO) const {
// ISel process runs DAGCombiner after legalization; this step is called
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a853..472ec67 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -81,6 +81,8 @@ private:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 49af78b..ed54404d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U :
defm EXTMUL_HIGH_U :
SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>;
+// Pattern for i32x4.dot_i16x8_s
+def : Pat<
+ (v4i32 (add
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 0), (i32 1), (i32 2), (i32 3),
+ (i32 8), (i32 9), (i32 10), (i32 11),
+ (i32 16), (i32 17), (i32 18), (i32 19),
+ (i32 24), (i32 25), (i32 26), (i32 27)),
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 4), (i32 5), (i32 6), (i32 7),
+ (i32 12), (i32 13), (i32 14), (i32 15),
+ (i32 20), (i32 21), (i32 22), (i32 23),
+ (i32 28), (i32 29), (i32 30), (i32 31)))
+ ),
+ (v4i32 (DOT v8i16:$lhs, v8i16:$rhs))
+>;
+
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
//===----------------------------------------------------------------------===//
@@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero,
// Relaxed (Negative) Multiply-Add (madd/nmadd)
//===----------------------------------------------------------------------===//
-multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> {
+multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+ list<Predicate> reqs> {
defm MADD_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
[(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
@@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
vec.prefix#".relaxed_nmadd", simdopS, reqs>;
- def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+
+ def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+}
+
+defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
+defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
- def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+//===----------------------------------------------------------------------===//
+// FP16 (Negative) Multiply-Add (madd/nmadd)
+//===----------------------------------------------------------------------===//
+
+multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+ list<Predicate> reqs> {
+ defm MADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".madd\t$dst, $a, $b, $c",
+ vec.prefix#".madd", simdopA, reqs>;
+ defm NMADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma
+ (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".nmadd\t$dst, $a, $b, $c",
+ vec.prefix#".nmadd", simdopS, reqs>;
}
+defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
-defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+// TODO: I think separate intrinsics should be introduced for these FP16 operations.
+def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (MADD_F16x8 V128:$a, V128:$b, V128:$c)>;
+def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>;
//===----------------------------------------------------------------------===//
// Laneselect
@@ -1711,6 +1763,26 @@ defm RELAXED_DOT :
"i16x8.relaxed_dot_i8x16_i7x16_s\t$dst, $lhs, $rhs",
"i16x8.relaxed_dot_i8x16_i7x16_s", 0x112>;
+def : Pat<
+ (v8i16 (add
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 0), (i32 1), (i32 4), (i32 5),
+ (i32 8), (i32 9), (i32 12), (i32 13),
+ (i32 16), (i32 17), (i32 20), (i32 21),
+ (i32 24), (i32 25), (i32 28), (i32 29)),
+ (wasm_shuffle
+ (v8i16 (extmul_low_s v16i8:$lhs, v16i8:$rhs)),
+ (v8i16 (extmul_high_s v16i8:$lhs, v16i8:$rhs)),
+ (i32 2), (i32 3), (i32 6), (i32 7),
+ (i32 10), (i32 11), (i32 14), (i32 15),
+ (i32 18), (i32 19), (i32 22), (i32 23),
+ (i32 26), (i32 27), (i32 30), (i32 31)))
+ ),
+ (v8i16 (RELAXED_DOT v16i8:$lhs, v16i8:$rhs))
+>;
+
defm RELAXED_DOT_ADD :
RELAXED_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, V128:$acc),
(outs), (ins),
@@ -1719,6 +1791,18 @@ defm RELAXED_DOT_ADD :
"i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
"i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
+def : Pat<
+ (v4i32 (add
+ (v4i32 (int_wasm_extadd_pairwise_signed
+ (v8i16 (int_wasm_relaxed_dot_i8x16_i7x16_signed v16i8:$lhs, v16i8:$rhs)))),
+ (v4i32 V128:$acc))),
+ (v4i32 (RELAXED_DOT_ADD v16i8:$lhs, v16i8:$rhs, (v4i32 V128:$acc)))
+ >;
+
+def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
+ (v16i8 V128:$rhs))),
+ (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
+
//===----------------------------------------------------------------------===//
// Relaxed BFloat16 dot product
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index a8908d4..ac251fd 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3514,15 +3514,16 @@ bool X86AsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
// xacquire <insn> ; xacquire must be accompanied by 'lock'
bool IsPrefix =
StringSwitch<bool>(Name)
- .Cases("cs", "ds", "es", "fs", "gs", "ss", true)
- .Cases("rex64", "data32", "data16", "addr32", "addr16", true)
- .Cases("xacquire", "xrelease", true)
- .Cases("acquire", "release", isParsingIntelSyntax())
+ .Cases({"cs", "ds", "es", "fs", "gs", "ss"}, true)
+ .Cases({"rex64", "data32", "data16", "addr32", "addr16"}, true)
+ .Cases({"xacquire", "xrelease"}, true)
+ .Cases({"acquire", "release"}, isParsingIntelSyntax())
.Default(false);
auto isLockRepeatNtPrefix = [](StringRef N) {
return StringSwitch<bool>(N)
- .Cases("lock", "rep", "repe", "repz", "repne", "repnz", "notrack", true)
+ .Cases({"lock", "rep", "repe", "repz", "repne", "repnz", "notrack"},
+ true)
.Default(false);
};
diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
index 27fba34..100f1ec 100644
--- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
@@ -1164,14 +1164,13 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_USUBO) &&
"unexpected instruction");
- const Register DstReg = I.getOperand(0).getReg();
- const Register CarryOutReg = I.getOperand(1).getReg();
- const Register Op0Reg = I.getOperand(2).getReg();
- const Register Op1Reg = I.getOperand(3).getReg();
- bool IsSub = I.getOpcode() == TargetOpcode::G_USUBE ||
- I.getOpcode() == TargetOpcode::G_USUBO;
- bool HasCarryIn = I.getOpcode() == TargetOpcode::G_UADDE ||
- I.getOpcode() == TargetOpcode::G_USUBE;
+ auto &CarryMI = cast<GAddSubCarryOut>(I);
+
+ const Register DstReg = CarryMI.getDstReg();
+ const Register CarryOutReg = CarryMI.getCarryOutReg();
+ const Register Op0Reg = CarryMI.getLHSReg();
+ const Register Op1Reg = CarryMI.getRHSReg();
+ bool IsSub = CarryMI.isSub();
const LLT DstTy = MRI.getType(DstReg);
assert(DstTy.isScalar() && "selectUAddSub only supported for scalar types");
@@ -1207,14 +1206,15 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I,
llvm_unreachable("selectUAddSub unsupported type.");
}
- const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
- const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const RegisterBank &CarryRB = *RBI.getRegBank(CarryOutReg, MRI, TRI);
+ const TargetRegisterClass *CarryRC =
+ getRegClass(MRI.getType(CarryOutReg), CarryRB);
unsigned Opcode = IsSub ? OpSUB : OpADD;
// G_UADDE/G_USUBE - find CarryIn def instruction.
- if (HasCarryIn) {
- Register CarryInReg = I.getOperand(4).getReg();
+ if (auto CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
+ Register CarryInReg = CarryInMI->getCarryInReg();
MachineInstr *Def = MRI.getVRegDef(CarryInReg);
while (Def->getOpcode() == TargetOpcode::G_TRUNC) {
CarryInReg = Def->getOperand(1).getReg();
@@ -1227,11 +1227,12 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I,
Def->getOpcode() == TargetOpcode::G_USUBE ||
Def->getOpcode() == TargetOpcode::G_USUBO) {
// carry set by prev ADD/SUB.
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY),
- X86::EFLAGS)
- .addReg(CarryInReg);
- if (!RBI.constrainGenericRegister(CarryInReg, *DstRC, MRI))
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::CMP8ri))
+ .addReg(CarryInReg)
+ .addImm(1);
+
+ if (!RBI.constrainGenericRegister(CarryInReg, *CarryRC, MRI))
return false;
Opcode = IsSub ? OpSBB : OpADC;
@@ -1250,11 +1251,11 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I,
.addReg(Op0Reg)
.addReg(Op1Reg);
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg)
- .addReg(X86::EFLAGS);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), CarryOutReg)
+ .addImm(X86::COND_B);
if (!constrainSelectedInstRegOperands(Inst, TII, TRI, RBI) ||
- !RBI.constrainGenericRegister(CarryOutReg, *DstRC, MRI))
+ !RBI.constrainGenericRegister(CarryOutReg, *CarryRC, MRI))
return false;
I.eraseFromParent();
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index 11ef721..28fa2cd 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -194,11 +194,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
.scalarize(0);
getActionDefinitionsBuilder({G_UADDE, G_UADDO, G_USUBE, G_USUBO})
- .legalFor({{s8, s1}, {s16, s1}, {s32, s1}})
- .legalFor(Is64Bit, {{s64, s1}})
+ .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
+ .legalFor(Is64Bit, {{s64, s8}})
.widenScalarToNextPow2(0, /*Min=*/32)
.clampScalar(0, s8, sMaxScalar)
- .clampScalar(1, s1, s1)
+ .clampScalar(1, s8, s8)
.scalarize(0);
// integer multiply
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 3af8b3e..8e08d16 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1335,10 +1335,12 @@ def ProcessorFeatures {
!listconcat(ARLFeatures, ARLSAdditionalFeatures);
// Pantherlake
- list<SubtargetFeature> PTLAdditionalFeatures = [FeaturePREFETCHI];
list<SubtargetFeature> PTLFeatures =
- !listremove(!listconcat(ARLSFeatures, PTLAdditionalFeatures), [FeatureWIDEKL]);
+ !listremove(ARLSFeatures, [FeatureWIDEKL]);
+ // Novalake
+ list<SubtargetFeature> NVLFeatures =
+ !listconcat(PTLFeatures, [FeaturePREFETCHI]);
// Clearwaterforest
list<SubtargetFeature> CWFAdditionalFeatures = [FeaturePREFETCHI,
@@ -1881,8 +1883,13 @@ def : ProcModel<P, AlderlakePModel,
}
def : ProcModel<"lunarlake", LunarlakePModel, ProcessorFeatures.ARLSFeatures,
ProcessorFeatures.ADLTuning>;
-def : ProcModel<"pantherlake", AlderlakePModel,
+foreach P = ["pantherlake", "wildcatlake"] in {
+def : ProcModel<P, AlderlakePModel,
ProcessorFeatures.PTLFeatures, ProcessorFeatures.ADLTuning>;
+}
+def : ProcModel<"novalake", AlderlakePModel, ProcessorFeatures.NVLFeatures,
+ ProcessorFeatures.ADLTuning>;
+
def : ProcModel<"clearwaterforest", AlderlakePModel,
ProcessorFeatures.CWFFeatures, ProcessorFeatures.ADLTuning>;
def : ProcModel<"emeraldrapids", SapphireRapidsModel,
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index e0991aa..9f88fda 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -602,8 +602,7 @@ namespace {
friend bool operator<(const TableEntry &TE, unsigned V) {
return TE.from < V;
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
- const TableEntry &TE) {
+ [[maybe_unused]] friend bool operator<(unsigned V, const TableEntry &TE) {
return V < TE.from;
}
};
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eea84a2..2feee05 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3624,6 +3624,16 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
BaseCost += 1;
+
+ // For OR conditions with EQ comparisons, prefer splitting into branches
+ // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
+ // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
+ // comparisons (SLT, SGT) that can be optimized.
+ if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
+ match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
+ match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
+ return {-1, -1, -1};
+
return {BaseCost, BrMergingLikelyBias.getValue(),
BrMergingUnlikelyBias.getValue()};
}
@@ -3787,7 +3797,7 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
/// Return true if every element in Mask, is an in-place blend/select mask or is
/// undef.
-LLVM_ATTRIBUTE_UNUSED static bool isBlendOrUndef(ArrayRef<int> Mask) {
+[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
for (auto [I, M] : enumerate(Mask))
if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
@@ -8096,7 +8106,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
return DstVec;
}
-LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
+[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
switch (Opcode) {
case X86ISD::PACKSS:
case X86ISD::PACKUS:
@@ -20813,7 +20823,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// for DAG type consistency we have to match the FP operand type.
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
- LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+ [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
@@ -22856,7 +22866,7 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
- if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+ if (isNullConstant(Y) && OpSize == 128 && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
@@ -29745,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
- unsigned NumElts = VT.getVectorNumElements();
-
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
- // and use pmullw to calculate the full 16-bit product.
+ // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+ // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
-
- MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi;
+ SDValue ALo, AHi, BLo, BHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- } else {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
- }
-
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the RHS is a constant, manually unpackl/unpackh and extend.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- SDValue LoOp = B.getOperand(i + j);
- SDValue HiOp = B.getOperand(i + j + 8);
-
- if (IsSigned) {
- LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
- LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
- DAG.getConstant(8, dl, MVT::i16));
- HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
- DAG.getConstant(8, dl, MVT::i16));
- } else {
- LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
- }
-
- LoOps.push_back(LoOp);
- HiOps.push_back(HiOp);
- }
- }
-
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29816,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -47751,6 +47726,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
+ // If the sign bit is known then BLENDV can be folded away.
+ if (N->getOpcode() == X86ISD::BLENDV) {
+ KnownBits KnownCond = DAG.computeKnownBits(Cond);
+ if (KnownCond.isNegative())
+ return LHS;
+ if (KnownCond.isNonNegative())
+ return RHS;
+ }
+
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
@@ -58332,11 +58316,12 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
} else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
SDValue Src = Op1;
SDValue Op10 = Op1.getOperand(0);
- if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
- // res, flags2 = sub 0, (and (xor X, -1), Y)
+ if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1)) &&
+ llvm::isOneConstant(Op1.getOperand(1))) {
+ // res, flags2 = sub 0, (and (xor X, -1), 1)
// cload/cstore ..., cond_ne, flag2
// ->
- // res, flags2 = sub 0, (and X, Y)
+ // res, flags2 = sub 0, (and X, 1)
// cload/cstore ..., cond_e, flag2
Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
Op1.getOperand(1));
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 6dd43b2..37d7772 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -606,16 +606,24 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
- Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+
+ RTLIB::LibcallImpl SecurityCookieVar =
+ getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
+ SecurityCookieVar != RTLIB::Unsupported) {
+ // MSVC CRT provides functionalities for stack protection.
// MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
+ M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
PointerType::getUnqual(M.getContext()));
// MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
- "__security_check_cookie", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
+ FunctionCallee SecurityCheckCookie =
+ M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
+ Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(M.getContext()));
+
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->setCallingConv(CallingConv::X86_FastCall);
F->addParamAttr(0, Attribute::AttrKind::InReg);
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 0fd44b7..ec31675 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1256,8 +1256,17 @@ def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
(MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
(MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+
+// If the globaladdr is an absolute_symbol, don't bother using the sign-extending
+// instruction since there's no benefit to using it with absolute symbols.
+def globalAddrNoAbsSym : PatLeaf<(tglobaladdr:$dst), [{
+ auto *GA = cast<GlobalAddressSDNode>(N);
+ return !GA->getGlobal()->getAbsoluteSymbolRange();
+}]>;
+def : Pat<(i64 (X86Wrapper globalAddrNoAbsSym:$dst)),
+ (MOV64ri32 tglobaladdr:$dst)>,
+ Requires<[KernelCode]>;
+
def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
(MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper mcsym:$dst)),
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1d2cd39..5c23f91 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10809,39 +10809,27 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- // PXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- // VPXOR is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- // VPXORY is safe to use because it doesn't affect flags.
- BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- // KXOR is safe to use because it doesn't affect flags.
- unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
- BuildMI(MBB, Iter, DL, get(Op), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
+ BuildMI(MBB, Iter, DL, get(Op), Reg);
}
}
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 481a9be..713d504 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1928,6 +1928,17 @@ static void addConstantComments(const MachineInstr *MI,
#define INSTR_CASE(Prefix, Instr, Suffix, Postfix) \
case X86::Prefix##Instr##Suffix##rm##Postfix:
+#define CASE_AVX512_ARITH_RM(Instr) \
+ INSTR_CASE(V, Instr, Z128, ) \
+ INSTR_CASE(V, Instr, Z128, k) \
+ INSTR_CASE(V, Instr, Z128, kz) \
+ INSTR_CASE(V, Instr, Z256, ) \
+ INSTR_CASE(V, Instr, Z256, k) \
+ INSTR_CASE(V, Instr, Z256, kz) \
+ INSTR_CASE(V, Instr, Z, ) \
+ INSTR_CASE(V, Instr, Z, k) \
+ INSTR_CASE(V, Instr, Z, kz)
+
#define CASE_ARITH_RM(Instr) \
INSTR_CASE(, Instr, , ) /* SSE */ \
INSTR_CASE(V, Instr, , ) /* AVX-128 */ \
@@ -1943,40 +1954,26 @@ static void addConstantComments(const MachineInstr *MI,
INSTR_CASE(V, Instr, Z, kz)
// TODO: Add additional instructions when useful.
- CASE_ARITH_RM(PMADDUBSW) {
- unsigned SrcIdx = getSrcIdx(MI, 1);
- if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- if (C->getType()->getScalarSizeInBits() == 8) {
- std::string Comment;
- raw_string_ostream CS(Comment);
- unsigned VectorWidth =
- X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
- CS << "[";
- printConstant(C, VectorWidth, CS);
- CS << "]";
- OutStreamer.AddComment(CS.str());
- }
- }
- break;
- }
-
+ CASE_ARITH_RM(PMADDUBSW)
CASE_ARITH_RM(PMADDWD)
+ CASE_ARITH_RM(PMULDQ)
+ CASE_ARITH_RM(PMULUDQ)
+ CASE_ARITH_RM(PMULLD)
+ CASE_AVX512_ARITH_RM(PMULLQ)
CASE_ARITH_RM(PMULLW)
CASE_ARITH_RM(PMULHW)
CASE_ARITH_RM(PMULHUW)
CASE_ARITH_RM(PMULHRSW) {
unsigned SrcIdx = getSrcIdx(MI, 1);
if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
- if (C->getType()->getScalarSizeInBits() == 16) {
- std::string Comment;
- raw_string_ostream CS(Comment);
- unsigned VectorWidth =
- X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
- CS << "[";
- printConstant(C, VectorWidth, CS);
- CS << "]";
- OutStreamer.AddComment(CS.str());
- }
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ unsigned VectorWidth =
+ X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+ CS << "[";
+ printConstant(C, VectorWidth, CS);
+ CS << "]";
+ OutStreamer.AddComment(CS.str());
}
break;
}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 080a9c0..4e73070 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -84,11 +84,11 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
case Xtensa::CCOMPARE0:
if (FeatureBits[Xtensa::FeatureTimers1])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::CCOMPARE1:
if (FeatureBits[Xtensa::FeatureTimers2])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::CCOMPARE2:
if (FeatureBits[Xtensa::FeatureTimers3])
return true;
@@ -107,37 +107,37 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
case Xtensa::EXCSAVE1:
case Xtensa::EXCVADDR:
return FeatureBits[Xtensa::FeatureException];
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC2:
case Xtensa::EPS2:
case Xtensa::EXCSAVE2:
if (FeatureBits[Xtensa::FeatureHighPriInterrupts])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC3:
case Xtensa::EPS3:
case Xtensa::EXCSAVE3:
if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC4:
case Xtensa::EPS4:
case Xtensa::EXCSAVE4:
if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC5:
case Xtensa::EPS5:
case Xtensa::EXCSAVE5:
if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC6:
case Xtensa::EPS6:
case Xtensa::EXCSAVE6:
if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6])
return true;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Xtensa::EPC7:
case Xtensa::EPS7:
case Xtensa::EXCSAVE7:
diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
index 89d5e0d..f6cea85 100644
--- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
@@ -22,13 +22,13 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
.Case("v5e", "v5te")
.Case("v6j", "v6")
.Case("v6hl", "v6k")
- .Cases("v6m", "v6sm", "v6s-m", "v6-m")
- .Cases("v6z", "v6zk", "v6kz")
- .Cases("v7", "v7a", "v7hl", "v7l", "v7-a")
+ .Cases({"v6m", "v6sm", "v6s-m"}, "v6-m")
+ .Cases({"v6z", "v6zk"}, "v6kz")
+ .Cases({"v7", "v7a", "v7hl", "v7l"}, "v7-a")
.Case("v7r", "v7-r")
.Case("v7m", "v7-m")
.Case("v7em", "v7e-m")
- .Cases("v8", "v8a", "v8l", "aarch64", "arm64", "v8-a")
+ .Cases({"v8", "v8a", "v8l", "aarch64", "arm64"}, "v8-a")
.Case("v8.1a", "v8.1-a")
.Case("v8.2a", "v8.2-a")
.Case("v8.3a", "v8.3-a")
@@ -39,7 +39,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
.Case("v8.8a", "v8.8-a")
.Case("v8.9a", "v8.9-a")
.Case("v8r", "v8-r")
- .Cases("v9", "v9a", "v9-a")
+ .Cases({"v9", "v9a"}, "v9-a")
.Case("v9.1a", "v9.1-a")
.Case("v9.2a", "v9.2-a")
.Case("v9.3a", "v9.3-a")
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index a5bdc9d..6065575 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -70,8 +70,8 @@
using namespace llvm;
-static std::unique_ptr<llvm::MemoryBuffer>
- LLVM_ATTRIBUTE_UNUSED getProcCpuinfoContent() {
+[[maybe_unused]] static std::unique_ptr<llvm::MemoryBuffer>
+getProcCpuinfoContent() {
const char *CPUInfoFile = "/proc/cpuinfo";
if (const char *CpuinfoIntercept = std::getenv("LLVM_CPUINFO"))
CPUInfoFile = CpuinfoIntercept;
@@ -964,6 +964,13 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
*Subtype = X86::INTEL_COREI7_PANTHERLAKE;
break;
+ // Wildcatlake:
+ case 0xd5:
+ CPU = "wildcatlake";
+ *Type = X86::INTEL_COREI7;
+ *Subtype = X86::INTEL_COREI7_PANTHERLAKE;
+ break;
+
// Graniterapids:
case 0xad:
CPU = "graniterapids";
@@ -1145,6 +1152,20 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family,
break;
}
break;
+ case 0x12:
+ switch (Model) {
+ // Novalake:
+ case 0x1:
+ case 0x3:
+ CPU = "novalake";
+ *Type = X86::INTEL_COREI7;
+ *Subtype = X86::INTEL_COREI7_NOVALAKE;
+ break;
+ default: // Unknown family 0x12 CPU.
+ break;
+ }
+ break;
+
default:
break; // Unknown.
}
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 9268df2..31126cc 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -887,7 +887,7 @@ void RISCVISAInfo::updateImplication() {
}
static constexpr StringLiteral CombineIntoExts[] = {
- {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
+ {"a"}, {"b"}, {"zk"}, {"zkn"}, {"zks"}, {"zvkn"},
{"zvknc"}, {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"},
};
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index acf8e4c..5ea63a9 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -228,6 +228,10 @@ void printVType(unsigned VType, raw_ostream &OS) {
OS << ", mu";
}
+void printXSfmmVType(unsigned VType, raw_ostream &OS) {
+ OS << "e" << getSEW(VType) << ", w" << getXSfmmWiden(VType);
+}
+
unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) {
unsigned LMul;
bool Fractional;
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index cea246e..d765d9c 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -258,7 +258,7 @@ static std::string computePowerDataLayout(const Triple &T) {
static std::string computeAMDDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ return "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
}
@@ -268,7 +268,7 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// (address space 7), and 128-bit non-integral buffer resourcees (address
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
- return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+ return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
"v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
@@ -548,8 +548,11 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
case Triple::csky:
return computeCSKYDataLayout(*this);
case Triple::dxil:
+ // TODO: We need to align vectors on the element size generally, but for now
+ // we hard code this for 3-element 32- and 64-bit vectors as a workaround.
+ // See https://github.com/llvm/llvm-project/issues/123968
return "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-"
- "f32:32-f64:64-n8:16:32:64";
+ "f32:32-f64:64-n8:16:32:64-v48:16:16-v96:32:32-v192:64:64";
case Triple::hexagon:
return "e-m:e-p:32:32:32-a:0-n16:32-"
"i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index f021094..1068ce4 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -579,87 +579,89 @@ static Triple::ArchType parseARMArch(StringRef ArchName) {
}
static Triple::ArchType parseArch(StringRef ArchName) {
- auto AT = StringSwitch<Triple::ArchType>(ArchName)
- .Cases("i386", "i486", "i586", "i686", Triple::x86)
- // FIXME: Do we need to support these?
- .Cases("i786", "i886", "i986", Triple::x86)
- .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64)
- .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc)
- .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle)
- .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64)
- .Cases("powerpc64le", "ppc64le", Triple::ppc64le)
- .Case("xscale", Triple::arm)
- .Case("xscaleeb", Triple::armeb)
- .Case("aarch64", Triple::aarch64)
- .Case("aarch64_be", Triple::aarch64_be)
- .Case("aarch64_32", Triple::aarch64_32)
- .Case("arc", Triple::arc)
- .Case("arm64", Triple::aarch64)
- .Case("arm64_32", Triple::aarch64_32)
- .Case("arm64e", Triple::aarch64)
- .Case("arm64ec", Triple::aarch64)
- .Case("arm", Triple::arm)
- .Case("armeb", Triple::armeb)
- .Case("thumb", Triple::thumb)
- .Case("thumbeb", Triple::thumbeb)
- .Case("avr", Triple::avr)
- .Case("m68k", Triple::m68k)
- .Case("msp430", Triple::msp430)
- .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6",
- "mipsr6", Triple::mips)
- .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el",
- Triple::mipsel)
- .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6",
- "mips64r6", "mipsn32r6", Triple::mips64)
- .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el",
- "mipsn32r6el", Triple::mips64el)
- .Case("r600", Triple::r600)
- .Case("amdgcn", Triple::amdgcn)
- .Case("riscv32", Triple::riscv32)
- .Case("riscv64", Triple::riscv64)
- .Case("riscv32be", Triple::riscv32be)
- .Case("riscv64be", Triple::riscv64be)
- .Case("hexagon", Triple::hexagon)
- .Cases("s390x", "systemz", Triple::systemz)
- .Case("sparc", Triple::sparc)
- .Case("sparcel", Triple::sparcel)
- .Cases("sparcv9", "sparc64", Triple::sparcv9)
- .Case("tce", Triple::tce)
- .Case("tcele", Triple::tcele)
- .Case("xcore", Triple::xcore)
- .Case("nvptx", Triple::nvptx)
- .Case("nvptx64", Triple::nvptx64)
- .Case("amdil", Triple::amdil)
- .Case("amdil64", Triple::amdil64)
- .Case("hsail", Triple::hsail)
- .Case("hsail64", Triple::hsail64)
- .Case("spir", Triple::spir)
- .Case("spir64", Triple::spir64)
- .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv)
- .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2",
- "spirv32v1.3", "spirv32v1.4", "spirv32v1.5",
- "spirv32v1.6", Triple::spirv32)
- .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2",
- "spirv64v1.3", "spirv64v1.4", "spirv64v1.5",
- "spirv64v1.6", Triple::spirv64)
- .StartsWith("kalimba", Triple::kalimba)
- .Case("lanai", Triple::lanai)
- .Case("renderscript32", Triple::renderscript32)
- .Case("renderscript64", Triple::renderscript64)
- .Case("shave", Triple::shave)
- .Case("ve", Triple::ve)
- .Case("wasm32", Triple::wasm32)
- .Case("wasm64", Triple::wasm64)
- .Case("csky", Triple::csky)
- .Case("loongarch32", Triple::loongarch32)
- .Case("loongarch64", Triple::loongarch64)
- .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3",
- "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7",
- "dxilv1.8", Triple::dxil)
- // Note: Cases has max limit of 10.
- .Case("dxilv1.9", Triple::dxil)
- .Case("xtensa", Triple::xtensa)
- .Default(Triple::UnknownArch);
+ auto AT =
+ StringSwitch<Triple::ArchType>(ArchName)
+ .Cases({"i386", "i486", "i586", "i686"}, Triple::x86)
+ // FIXME: Do we need to support these?
+ .Cases({"i786", "i886", "i986"}, Triple::x86)
+ .Cases({"amd64", "x86_64", "x86_64h"}, Triple::x86_64)
+ .Cases({"powerpc", "powerpcspe", "ppc", "ppc32"}, Triple::ppc)
+ .Cases({"powerpcle", "ppcle", "ppc32le"}, Triple::ppcle)
+ .Cases({"powerpc64", "ppu", "ppc64"}, Triple::ppc64)
+ .Cases({"powerpc64le", "ppc64le"}, Triple::ppc64le)
+ .Case("xscale", Triple::arm)
+ .Case("xscaleeb", Triple::armeb)
+ .Case("aarch64", Triple::aarch64)
+ .Case("aarch64_be", Triple::aarch64_be)
+ .Case("aarch64_32", Triple::aarch64_32)
+ .Case("arc", Triple::arc)
+ .Case("arm64", Triple::aarch64)
+ .Case("arm64_32", Triple::aarch64_32)
+ .Case("arm64e", Triple::aarch64)
+ .Case("arm64ec", Triple::aarch64)
+ .Case("arm", Triple::arm)
+ .Case("armeb", Triple::armeb)
+ .Case("thumb", Triple::thumb)
+ .Case("thumbeb", Triple::thumbeb)
+ .Case("avr", Triple::avr)
+ .Case("m68k", Triple::m68k)
+ .Case("msp430", Triple::msp430)
+ .Cases({"mips", "mipseb", "mipsallegrex", "mipsisa32r6", "mipsr6"},
+ Triple::mips)
+ .Cases({"mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el"},
+ Triple::mipsel)
+ .Cases({"mips64", "mips64eb", "mipsn32", "mipsisa64r6", "mips64r6",
+ "mipsn32r6"},
+ Triple::mips64)
+ .Cases({"mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el",
+ "mipsn32r6el"},
+ Triple::mips64el)
+ .Case("r600", Triple::r600)
+ .Case("amdgcn", Triple::amdgcn)
+ .Case("riscv32", Triple::riscv32)
+ .Case("riscv64", Triple::riscv64)
+ .Case("riscv32be", Triple::riscv32be)
+ .Case("riscv64be", Triple::riscv64be)
+ .Case("hexagon", Triple::hexagon)
+ .Cases({"s390x", "systemz"}, Triple::systemz)
+ .Case("sparc", Triple::sparc)
+ .Case("sparcel", Triple::sparcel)
+ .Cases({"sparcv9", "sparc64"}, Triple::sparcv9)
+ .Case("tce", Triple::tce)
+ .Case("tcele", Triple::tcele)
+ .Case("xcore", Triple::xcore)
+ .Case("nvptx", Triple::nvptx)
+ .Case("nvptx64", Triple::nvptx64)
+ .Case("amdil", Triple::amdil)
+ .Case("amdil64", Triple::amdil64)
+ .Case("hsail", Triple::hsail)
+ .Case("hsail64", Triple::hsail64)
+ .Case("spir", Triple::spir)
+ .Case("spir64", Triple::spir64)
+ .Cases({"spirv", "spirv1.5", "spirv1.6"}, Triple::spirv)
+ .Cases({"spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2",
+ "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", "spirv32v1.6"},
+ Triple::spirv32)
+ .Cases({"spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2",
+ "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", "spirv64v1.6"},
+ Triple::spirv64)
+ .StartsWith("kalimba", Triple::kalimba)
+ .Case("lanai", Triple::lanai)
+ .Case("renderscript32", Triple::renderscript32)
+ .Case("renderscript64", Triple::renderscript64)
+ .Case("shave", Triple::shave)
+ .Case("ve", Triple::ve)
+ .Case("wasm32", Triple::wasm32)
+ .Case("wasm64", Triple::wasm64)
+ .Case("csky", Triple::csky)
+ .Case("loongarch32", Triple::loongarch32)
+ .Case("loongarch64", Triple::loongarch64)
+ .Cases({"dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3",
+ "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", "dxilv1.8",
+ "dxilv1.9"},
+ Triple::dxil)
+ .Case("xtensa", Triple::xtensa)
+ .Default(Triple::UnknownArch);
// Some architectures require special parsing logic just to compute the
// ArchType result.
@@ -1071,7 +1073,7 @@ Triple::Triple(std::string &&Str) : Data(std::move(Str)) {
.StartsWith("mips64", Triple::GNUABI64)
.StartsWith("mipsisa64", Triple::GNUABI64)
.StartsWith("mipsisa32", Triple::GNU)
- .Cases("mips", "mipsel", "mipsr6", "mipsr6el", Triple::GNU)
+ .Cases({"mips", "mipsel", "mipsr6", "mipsr6el"}, Triple::GNU)
.Default(UnknownEnvironment);
}
}
diff --git a/llvm/lib/TargetParser/Unix/Host.inc b/llvm/lib/TargetParser/Unix/Host.inc
index aeb2f59..38b942d 100644
--- a/llvm/lib/TargetParser/Unix/Host.inc
+++ b/llvm/lib/TargetParser/Unix/Host.inc
@@ -59,10 +59,30 @@ static std::string updateTripleOSVersion(std::string TargetTripleString) {
if (TT.getOS() == Triple::AIX && !TT.getOSMajorVersion()) {
struct utsname name;
if (uname(&name) != -1) {
+ std::string release = name.release;
+
+ if (strcmp(name.sysname, "OS400") == 0) {
+ /*
+ PASE uses different versioning system than AIX.
+ The following table shows the currently supported PASE
+ releases and the corresponding AIX release:
+ --------------------------
+ PASE | AIX
+ --------------------------
+ V7R4 | 7.2 (TL2)
+ --------------------------
+ V7R5 | 7.2 (TL5)
+ --------------------------
+ V7R6 | 7.3 (TL1)
+ --------------------------
+ */
+ release = (release == "4" || release == "5") ? "2" : "3";
+ }
+
std::string NewOSName = std::string(Triple::getOSTypeName(Triple::AIX));
NewOSName += name.version;
NewOSName += '.';
- NewOSName += name.release;
+ NewOSName += release;
NewOSName += ".0.0";
TT.setOSName(NewOSName);
return TT.str();
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index edca7c1..dd13ce3 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -175,7 +175,9 @@ constexpr FeatureBitset FeaturesArrowlakeS =
FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
FeatureSM4;
constexpr FeatureBitset FeaturesPantherlake =
- (FeaturesArrowlakeS ^ FeatureWIDEKL) | FeaturePREFETCHI;
+ (FeaturesArrowlakeS ^ FeatureWIDEKL);
+constexpr FeatureBitset FeaturesNovalake =
+ FeaturesPantherlake | FeaturePREFETCHI;
constexpr FeatureBitset FeaturesClearwaterforest =
(FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 |
FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR;
@@ -378,6 +380,9 @@ constexpr ProcInfo Processors[] = {
{ {"gracemont"}, CK_Gracemont, FEATURE_AVX2, FeaturesAlderlake, 'p', false },
// Pantherlake microarchitecture based processors.
{ {"pantherlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
+ { {"wildcatlake"}, CK_Lunarlake, FEATURE_AVX2, FeaturesPantherlake, 'p', false },
+ // Novalake microarchitecture based processors.
+ { {"novalake"}, CK_Novalake, FEATURE_AVX2, FeaturesNovalake, 'r', false },
// Sierraforest microarchitecture based processors.
{ {"sierraforest"}, CK_Sierraforest, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
// Grandridge microarchitecture based processors.
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index b73a0ce..4645670 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -147,7 +147,7 @@ public:
private:
// Only add checks if the module has the cfguard=2 flag.
- int cfguard_module_flag = 0;
+ int CFGuardModuleFlag = 0;
StringRef GuardFnName;
Mechanism GuardMechanism = Mechanism::Check;
FunctionType *GuardFnType = nullptr;
@@ -162,9 +162,7 @@ public:
static char ID;
// Default constructor required for the INITIALIZE_PASS macro.
- CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- }
+ CFGuard(CFGuardImpl::Mechanism M) : FunctionPass(ID), Impl(M) {}
bool doInitialization(Module &M) override { return Impl.doInitialization(M); }
bool runOnFunction(Function &F) override { return Impl.runOnFunction(F); }
@@ -173,7 +171,6 @@ public:
} // end anonymous namespace
void CFGuardImpl::insertCFGuardCheck(CallBase *CB) {
-
assert(CB->getModule()->getTargetTriple().isOSWindows() &&
"Only applicable for Windows targets");
assert(CB->isIndirectCall() &&
@@ -202,7 +199,6 @@ void CFGuardImpl::insertCFGuardCheck(CallBase *CB) {
}
void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
-
assert(CB->getModule()->getTargetTriple().isOSWindows() &&
"Only applicable for Windows targets");
assert(CB->isIndirectCall() &&
@@ -236,14 +232,13 @@ void CFGuardImpl::insertCFGuardDispatch(CallBase *CB) {
}
bool CFGuardImpl::doInitialization(Module &M) {
-
// Check if this module has the cfguard flag and read its value.
if (auto *MD =
mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
- cfguard_module_flag = MD->getZExtValue();
+ CFGuardModuleFlag = MD->getZExtValue();
// Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
+ if (CFGuardModuleFlag != 2)
return false;
// Set up prototypes for the guard check and dispatch functions.
@@ -264,9 +259,8 @@ bool CFGuardImpl::doInitialization(Module &M) {
}
bool CFGuardImpl::runOnFunction(Function &F) {
-
// Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
+ if (CFGuardModuleFlag != 2)
return false;
SmallVector<CallBase *, 8> IndirectCalls;
@@ -286,19 +280,16 @@ bool CFGuardImpl::runOnFunction(Function &F) {
}
// If no checks are needed, return early.
- if (IndirectCalls.empty()) {
+ if (IndirectCalls.empty())
return false;
- }
// For each indirect call/invoke, add the appropriate dispatch or check.
if (GuardMechanism == Mechanism::Dispatch) {
- for (CallBase *CB : IndirectCalls) {
+ for (CallBase *CB : IndirectCalls)
insertCFGuardDispatch(CB);
- }
} else {
- for (CallBase *CB : IndirectCalls) {
+ for (CallBase *CB : IndirectCalls)
insertCFGuardCheck(CB);
- }
}
return true;
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index f166fef..cf7e450 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -153,26 +153,23 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
if (IsCallerPresplitCoroutine && HasAttr) {
- BranchProbability MinBranchProbability(
- static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
- MinBlockCounterExecution);
-
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
- auto Prob = BranchProbability::getBranchProbability(
- BFI.getBlockFreq(CB->getParent()).getFrequency(),
- BFI.getEntryFreq().getFrequency());
+ auto BlockFreq = BFI.getBlockFreq(CB->getParent()).getFrequency();
+ auto EntryFreq = BFI.getEntryFreq().getFrequency();
+ uint64_t MinFreq =
+ static_cast<uint64_t>(EntryFreq * CoroElideBranchRatio);
- if (Prob < MinBranchProbability) {
+ if (BlockFreq < MinFreq) {
ORE.emit([&]() {
return OptimizationRemarkMissed(
DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' not elided in '"
<< ore::NV("caller", Caller->getName())
- << "' because of low probability: "
- << ore::NV("probability", Prob) << " (threshold: "
- << ore::NV("threshold", MinBranchProbability) << ")";
+ << "' because of low frequency: "
+ << ore::NV("block_freq", BlockFreq)
+ << " (threshold: " << ore::NV("min_freq", MinFreq) << ")";
});
continue;
}
@@ -188,7 +185,8 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' elided in '" << ore::NV("caller", Caller->getName())
- << "' (probability: " << ore::NV("probability", Prob) << ")";
+ << "' (block_freq: " << ore::NV("block_freq", BlockFreq)
+ << ")";
});
FAM.invalidate(*Caller, PreservedAnalyses::none());
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index 26ec4f3..e05fe28 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -1,3 +1,4 @@
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,9 +20,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
enum class CloneKind {
/// The shared resume function for a switch lowering.
@@ -149,8 +148,6 @@ public:
}
};
-} // end namespace coro
-
-} // end namespace llvm
+} // end namespace llvm::coro
#endif // LLVM_LIB_TRANSFORMS_COROUTINES_COROCLONER_H
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 471b9eb..cdb5852 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -38,7 +38,7 @@ public:
AnyResumeFnPtrTy(PointerType::getUnqual(Context)) {}
void lowerEarlyIntrinsics(Function &F);
};
-}
+} // namespace
// Replace a direct call to coro.resume or coro.destroy with an indirect call to
// an address returned by coro.subfn.addr intrinsic. This is done so that
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 52f4ffe..cc47a55 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -16,11 +16,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Coroutines/CoroShape.h"
-namespace llvm {
-
-class CallGraph;
-
-namespace coro {
+namespace llvm::coro {
bool isSuspendBlock(BasicBlock *BB);
bool declaresAnyIntrinsic(const Module &M);
@@ -61,7 +57,6 @@ void normalizeCoroutine(Function &F, coro::Shape &Shape,
CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
TargetTransformInfo &TTI,
ArrayRef<Value *> Arguments, IRBuilder<> &);
-} // End namespace coro.
-} // End namespace llvm
+} // End namespace llvm::coro
#endif
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
index 6aaabca..f2444da 100644
--- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -137,8 +137,7 @@ struct RematGraph {
} // namespace
-namespace llvm {
-template <> struct GraphTraits<RematGraph *> {
+template <> struct llvm::GraphTraits<RematGraph *> {
using NodeRef = RematGraph::RematNode *;
using ChildIteratorType = RematGraph::RematNode **;
@@ -149,8 +148,6 @@ template <> struct GraphTraits<RematGraph *> {
static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};
-} // end namespace llvm
-
// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index e474c07..81fe0c9 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -16,11 +16,8 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-namespace llvm {
-
-namespace coro {
-
-namespace {
+using namespace llvm;
+using namespace llvm::coro;
typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
@@ -71,7 +68,7 @@ static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
/// This happens during the all-instructions iteration, so it must not
/// delete the call.
static Instruction *
-lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const coro::Shape &Shape,
+lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const Shape &Shape,
SmallVectorImpl<Instruction *> &DeadInsts) {
IRBuilder<> Builder(AI);
auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
@@ -450,10 +447,8 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
Visitor.getMayWriteBeforeCoroBegin());
}
-} // namespace
-
-void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromArgs(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
// Collect the spills for arguments and other not-materializable values.
for (Argument &A : F.args())
for (User *U : A.users())
@@ -461,7 +456,7 @@ void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
Spills[&A].push_back(cast<Instruction>(U));
}
-void collectSpillsAndAllocasFromInsts(
+void coro::collectSpillsAndAllocasFromInsts(
SpillInfo &Spills, SmallVector<AllocaInfo, 8> &Allocas,
SmallVector<Instruction *, 4> &DeadInstructions,
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
@@ -516,8 +511,8 @@ void collectSpillsAndAllocasFromInsts(
}
}
-void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
// We don't want the layout of coroutine frame to be affected
// by debug information. So we only choose to salvage dbg.values for
// whose value is already in the frame.
@@ -535,10 +530,9 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
/// Async and Retcon{Once} conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
-void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
- CoroBeginInst *CoroBegin,
- coro::SpillInfo &Spills,
- SmallVectorImpl<coro::AllocaInfo> &Allocas) {
+void coro::sinkSpillUsesAfterCoroBegin(
+ const DominatorTree &Dom, CoroBeginInst *CoroBegin, coro::SpillInfo &Spills,
+ SmallVectorImpl<coro::AllocaInfo> &Allocas) {
SmallSetVector<Instruction *, 32> ToMove;
SmallVector<Instruction *, 32> Worklist;
@@ -582,8 +576,9 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
Inst->moveBefore(InsertPt->getIterator());
}
-BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
- const DominatorTree &DT) {
+BasicBlock::iterator coro::getSpillInsertionPt(const coro::Shape &Shape,
+ Value *Def,
+ const DominatorTree &DT) {
BasicBlock::iterator InsertPt;
if (auto *Arg = dyn_cast<Argument>(Def)) {
// For arguments, we will place the store instruction right after
@@ -625,7 +620,3 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
return InsertPt;
}
-
-} // End namespace coro.
-
-} // End namespace llvm.
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5066a99..894d83f 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -6150,3 +6150,42 @@ void MemProfContextDisambiguation::run(
IndexCallsiteContextGraph CCG(Index, isPrevailing);
CCG.process();
}
+
+// Strips MemProf attributes and metadata. Can be invoked by the pass pipeline
+// when we don't have an index that has recorded that we are linking with
+// allocation libraries containing the necessary APIs for downstream
+// transformations.
+PreservedAnalyses MemProfRemoveInfo::run(Module &M, ModuleAnalysisManager &AM) {
+ // The profile matcher applies hotness attributes directly for allocations,
+ // and those will cause us to generate calls to the hot/cold interfaces
+ // unconditionally. If supports-hot-cold-new was not enabled in the LTO
+ // link then assume we don't want these calls (e.g. not linking with
+ // the appropriate library, or otherwise trying to disable this behavior).
+ bool Changed = false;
+ for (auto &F : M) {
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *CI = dyn_cast<CallBase>(&I);
+ if (!CI)
+ continue;
+ if (CI->hasFnAttr("memprof")) {
+ CI->removeFnAttr("memprof");
+ Changed = true;
+ }
+ if (!CI->hasMetadata(LLVMContext::MD_callsite)) {
+ assert(!CI->hasMetadata(LLVMContext::MD_memprof));
+ continue;
+ }
+ // Strip off all memprof metadata as it is no longer needed.
+ // Importantly, this avoids the addition of new memprof attributes
+ // after inlining propagation.
+ CI->setMetadata(LLVMContext::MD_memprof, nullptr);
+ CI->setMetadata(LLVMContext::MD_callsite, nullptr);
+ Changed = true;
+ }
+ }
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4c9b10a..cdc559b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
Type *Ty = CI.getType();
- if (auto *SrcC = dyn_cast<Constant>(Src))
- if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL))
- return replaceInstUsesWith(CI, Res);
+ if (Value *Res =
+ simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, Res);
// Try to eliminate a cast of a cast.
if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 07ad65c..fba1ccf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1481,13 +1481,13 @@ Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
return new ICmpInst(Pred, Y, ConstantInt::get(SrcTy, C.logBase2()));
}
- if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ if (Cmp.isEquality() && (Trunc->hasOneUse() || Trunc->hasNoUnsignedWrap())) {
// Canonicalize to a mask and wider compare if the wide type is suitable:
// (trunc X to i8) == C --> (X & 0xff) == (zext C)
if (!SrcTy->isVectorTy() && shouldChangeType(DstBits, SrcBits)) {
Constant *Mask =
ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcBits, DstBits));
- Value *And = Builder.CreateAnd(X, Mask);
+ Value *And = Trunc->hasNoUnsignedWrap() ? X : Builder.CreateAnd(X, Mask);
Constant *WideC = ConstantInt::get(SrcTy, C.zext(SrcBits));
return new ICmpInst(Pred, And, WideC);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7071876..943c223 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -471,7 +471,6 @@ private:
Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
unsigned Depth = 0);
-public:
/// Create `select C, S1, S2`. Use only when the profile cannot be calculated
/// from existing profile metadata: if the Function has profiles, this will
/// set the profile of this select to "unknown".
@@ -484,6 +483,7 @@ public:
return Sel;
}
+public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 6b67b48..09cb225 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2979,10 +2979,14 @@ Instruction *InstCombinerImpl::foldAndOrOfSelectUsingImpliedCond(Value *Op,
"Op must be either i1 or vector of i1.");
if (SI.getCondition()->getType() != Op->getType())
return nullptr;
- if (Value *V = simplifyNestedSelectsUsingImpliedCond(SI, Op, IsAnd, DL))
- return SelectInst::Create(Op,
- IsAnd ? V : ConstantInt::getTrue(Op->getType()),
- IsAnd ? ConstantInt::getFalse(Op->getType()) : V);
+ if (Value *V = simplifyNestedSelectsUsingImpliedCond(SI, Op, IsAnd, DL)) {
+ Instruction *MDFrom = nullptr;
+ if (!ProfcheckDisableMetadataFixes)
+ MDFrom = &SI;
+ return SelectInst::Create(
+ Op, IsAnd ? V : ConstantInt::getTrue(Op->getType()),
+ IsAnd ? ConstantInt::getFalse(Op->getType()) : V, "", nullptr, MDFrom);
+ }
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 63e24a0..a330bb7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -110,8 +110,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
ShrAmt->getName() + ".z");
// There is no existing !prof metadata we can derive the !prof metadata for
// this select.
- Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper);
- IC.Builder.Insert(Select);
+ Value *Select = IC.Builder.CreateSelectWithUnknownProfile(ShrAmtZ, Lower,
+ Upper, DEBUG_TYPE);
Select->takeName(I);
return Select;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 82ac903..3f11cae 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1690,6 +1690,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
// 2) (fp_binop ({s|u}itofp x), FpC)
// -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
+ // Don't perform the fold on vectors, as the integer operation may be much
+ // more expensive than the float operation in that case.
+ if (BO.getType()->isVectorTy())
+ return nullptr;
+
std::array<Value *, 2> IntOps = {nullptr, nullptr};
Constant *Op1FpC = nullptr;
// Check for:
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 860f8f7..2646334 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -605,17 +605,16 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
return Mapping;
}
-namespace llvm {
-void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
- bool IsKasan, uint64_t *ShadowBase,
- int *MappingScale, bool *OrShadowOffset) {
+void llvm::getAddressSanitizerParams(const Triple &TargetTriple, int LongSize,
+ bool IsKasan, uint64_t *ShadowBase,
+ int *MappingScale, bool *OrShadowOffset) {
auto Mapping = getShadowMapping(TargetTriple, LongSize, IsKasan);
*ShadowBase = Mapping.Offset;
*MappingScale = Mapping.Scale;
*OrShadowOffset = Mapping.OrShadowOffset;
}
-void removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
+void llvm::removeASanIncompatibleFnAttributes(Function &F, bool ReadsArgMem) {
// Sanitizer checks read from shadow, which invalidates memory(argmem: *).
//
// This is not only true for sanitized functions, because AttrInfer can
@@ -668,8 +667,6 @@ ASanAccessInfo::ASanAccessInfo(bool IsWrite, bool CompileKernel,
AccessSizeIndex(AccessSizeIndex), IsWrite(IsWrite),
CompileKernel(CompileKernel) {}
-} // namespace llvm
-
static uint64_t getRedzoneSizeForScale(int MappingScale) {
// Redzone used for stack and globals is at least 32 bytes.
// For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
@@ -677,11 +674,10 @@ static uint64_t getRedzoneSizeForScale(int MappingScale) {
}
static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
- if (TargetTriple.isOSEmscripten()) {
+ if (TargetTriple.isOSEmscripten())
return kAsanEmscriptenCtorAndDtorPriority;
- } else {
+ else
return kAsanCtorAndDtorPriority;
- }
}
static Twine genName(StringRef suffix) {
@@ -848,6 +844,7 @@ struct AddressSanitizer {
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
void markEscapedLocalAllocas(Function &F);
+ void markCatchParametersAsUninteresting(Function &F);
private:
friend struct FunctionStackPoisoner;
@@ -3001,6 +2998,22 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
}
}
}
+// Mitigation for https://github.com/google/sanitizers/issues/749
+// We don't instrument Windows catch-block parameters to avoid
+// interfering with exception handling assumptions.
+void AddressSanitizer::markCatchParametersAsUninteresting(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (auto *CatchPad = dyn_cast<CatchPadInst>(&I)) {
+ // Mark the parameters to a catch-block as uninteresting to avoid
+ // instrumenting them.
+ for (Value *Operand : CatchPad->arg_operands())
+ if (auto *AI = dyn_cast<AllocaInst>(Operand))
+ ProcessedAllocas[AI] = false;
+ }
+ }
+ }
+}
bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
bool ShouldInstrument =
@@ -3045,6 +3058,9 @@ bool AddressSanitizer::instrumentFunction(Function &F,
// can be passed to that intrinsic.
markEscapedLocalAllocas(F);
+ if (TargetTriple.isOSWindows())
+ markCatchParametersAsUninteresting(F);
+
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
SmallPtrSet<Value *, 16> TempsToInstrument;
@@ -3337,7 +3353,7 @@ PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
Value *ValueIfTrue,
Instruction *ThenTerm,
Value *ValueIfFalse) {
- PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
+ PHINode *PHI = IRB.CreatePHI(ValueIfTrue->getType(), 2);
BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
PHI->addIncoming(ValueIfFalse, CondBlock);
BasicBlock *ThenBlock = ThenTerm->getParent();
@@ -3360,7 +3376,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout(
assert((ClRealignStack & (ClRealignStack - 1)) == 0);
uint64_t FrameAlignment = std::max(L.FrameAlignment, uint64_t(ClRealignStack));
Alloca->setAlignment(Align(FrameAlignment));
- return IRB.CreatePointerCast(Alloca, IntptrTy);
+ return Alloca;
}
void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
@@ -3572,10 +3588,12 @@ void FunctionStackPoisoner::processStaticAllocas() {
DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
+ Type *PtrTy = F.getDataLayout().getAllocaPtrType(F.getContext());
Value *StaticAlloca =
DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
- Value *FakeStack;
+ Value *FakeStackPtr;
+ Value *FakeStackInt;
Value *LocalStackBase;
Value *LocalStackBaseAlloca;
uint8_t DIExprFlags = DIExpression::ApplyOffset;
@@ -3603,20 +3621,21 @@ void FunctionStackPoisoner::processStaticAllocas() {
RTCI.createRuntimeCall(IRBIf, AsanStackMallocFunc[StackMallocIdx],
ConstantInt::get(IntptrTy, LocalStackSize));
IRB.SetInsertPoint(InsBefore);
- FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
- ConstantInt::get(IntptrTy, 0));
+ FakeStackInt = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue,
+ Term, ConstantInt::get(IntptrTy, 0));
} else {
// assert(ASan.UseAfterReturn == AsanDetectStackUseAfterReturnMode:Always)
// void *FakeStack = __asan_stack_malloc_N(LocalStackSize);
// void *LocalStackBase = (FakeStack) ? FakeStack :
// alloca(LocalStackSize);
StackMallocIdx = StackMallocSizeClass(LocalStackSize);
- FakeStack =
+ FakeStackInt =
RTCI.createRuntimeCall(IRB, AsanStackMallocFunc[StackMallocIdx],
ConstantInt::get(IntptrTy, LocalStackSize));
}
+ FakeStackPtr = IRB.CreateIntToPtr(FakeStackInt, PtrTy);
Value *NoFakeStack =
- IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+ IRB.CreateICmpEQ(FakeStackInt, Constant::getNullValue(IntptrTy));
Instruction *Term =
SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
IRBuilder<> IRBIf(Term);
@@ -3624,67 +3643,53 @@ void FunctionStackPoisoner::processStaticAllocas() {
DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
IRB.SetInsertPoint(InsBefore);
- LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+ LocalStackBase =
+ createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStackPtr);
IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
DIExprFlags |= DIExpression::DerefBefore;
} else {
// void *FakeStack = nullptr;
// void *LocalStackBase = alloca(LocalStackSize);
- FakeStack = ConstantInt::get(IntptrTy, 0);
+ FakeStackInt = Constant::getNullValue(IntptrTy);
+ FakeStackPtr = Constant::getNullValue(PtrTy);
LocalStackBase =
DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
LocalStackBaseAlloca = LocalStackBase;
}
- // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
- // dbg.declare address opereand, but passing a `ptrtoint` seems to confuse
- // later passes and can result in dropped variable coverage in debug info.
- Value *LocalStackBaseAllocaPtr =
- isa<PtrToIntInst>(LocalStackBaseAlloca)
- ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
- : LocalStackBaseAlloca;
- assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
- "Variable descriptions relative to ASan stack base will be dropped");
-
// Replace Alloca instructions with base+offset.
SmallVector<Value *> NewAllocaPtrs;
for (const auto &Desc : SVD) {
AllocaInst *AI = Desc.AI;
- replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
- Desc.Offset);
- Value *NewAllocaPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
- AI->getType());
+ replaceDbgDeclare(AI, LocalStackBaseAlloca, DIB, DIExprFlags, Desc.Offset);
+ Value *NewAllocaPtr = IRB.CreatePtrAdd(
+ LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset));
AI->replaceAllUsesWith(NewAllocaPtr);
NewAllocaPtrs.push_back(NewAllocaPtr);
}
// The left-most redzone has enough space for at least 4 pointers.
// Write the Magic value to redzone[0].
- Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
- BasePlus0);
+ LocalStackBase);
// Write the frame description constant to redzone[1].
- Value *BasePlus1 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
- IntptrPtrTy);
+ Value *BasePlus1 = IRB.CreatePtrAdd(
+ LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize / 8));
GlobalVariable *StackDescriptionGlobal =
createPrivateGlobalForString(*F.getParent(), DescriptionString,
/*AllowMerging*/ true, genName("stack"));
Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
IRB.CreateStore(Description, BasePlus1);
// Write the PC to redzone[2].
- Value *BasePlus2 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
- IntptrPtrTy);
+ Value *BasePlus2 = IRB.CreatePtrAdd(
+ LocalStackBase, ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8));
IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
// Poison the stack red zones at the entry.
- Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+ Value *ShadowBase =
+ ASan.memToShadow(IRB.CreatePtrToInt(LocalStackBase, IntptrTy), IRB);
// As mask we must use most poisoned case: red zones and after scope.
// As bytes we can use either the same or just red zones only.
copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
@@ -3723,7 +3728,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
IRBuilder<> IRBRet(Ret);
// Mark the current frame as retired.
IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
- BasePlus0);
+ LocalStackBase);
if (DoStackMalloc) {
assert(StackMallocIdx >= 0);
// if FakeStack != 0 // LocalStackBase == FakeStack
@@ -3737,7 +3742,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
// else
// <This is not a fake stack; unpoison the redzones>
Value *Cmp =
- IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
+ IRBRet.CreateICmpNE(FakeStackInt, Constant::getNullValue(IntptrTy));
Instruction *ThenTerm, *ElseTerm;
SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
@@ -3748,11 +3753,10 @@ void FunctionStackPoisoner::processStaticAllocas() {
kAsanStackUseAfterReturnMagic);
copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
ShadowBase);
- Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
- FakeStack,
+ Value *SavedFlagPtrPtr = IRBPoison.CreatePtrAdd(
+ FakeStackPtr,
ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
- Value *SavedFlagPtr = IRBPoison.CreateLoad(
- IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+ Value *SavedFlagPtr = IRBPoison.CreateLoad(IntptrTy, SavedFlagPtrPtr);
IRBPoison.CreateStore(
Constant::getNullValue(IRBPoison.getInt8Ty()),
IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getPtrTy()));
@@ -3760,7 +3764,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
// For larger frames call __asan_stack_free_*.
RTCI.createRuntimeCall(
IRBPoison, AsanStackFreeFunc[StackMallocIdx],
- {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
+ {FakeStackInt, ConstantInt::get(IntptrTy, LocalStackSize)});
}
IRBuilder<> IRBElse(ElseTerm);
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 7c78eb3..72e8e50 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -396,9 +396,8 @@ class CHR {
} // end anonymous namespace
-static inline
-raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS,
- const CHRStats &Stats) {
+[[maybe_unused]] static inline raw_ostream &operator<<(raw_ostream &OS,
+ const CHRStats &Stats) {
Stats.print(OS);
return OS;
}
@@ -425,8 +424,8 @@ static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) {
return PSI.isFunctionEntryHot(&F);
}
-static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label,
- CHRStats *Stats) {
+[[maybe_unused]] static void dumpIR(Function &F, const char *Label,
+ CHRStats *Stats) {
StringRef FuncName = F.getName();
StringRef ModuleName = F.getParent()->getName();
(void)(FuncName); // Unused in release build.
@@ -1622,7 +1621,7 @@ static void insertTrivialPHIs(CHRScope *Scope,
}
// Assert that all the CHR regions of the scope have a biased branch or select.
-static void LLVM_ATTRIBUTE_UNUSED
+[[maybe_unused]] static void
assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
#ifndef NDEBUG
auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
@@ -1644,8 +1643,9 @@ assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
// Assert that all the condition values of the biased branches and selects have
// been hoisted to the pre-entry block or outside of the scope.
-static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
- CHRScope *Scope, BasicBlock *PreEntryBlock) {
+[[maybe_unused]] static void
+assertBranchOrSelectConditionHoisted(CHRScope *Scope,
+ BasicBlock *PreEntryBlock) {
CHR_DEBUG(dbgs() << "Biased regions condition values \n");
for (RegInfo &RI : Scope->CHRRegions) {
Region *R = RI.R;
@@ -2007,8 +2007,8 @@ void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
}
}
-static void LLVM_ATTRIBUTE_UNUSED
-dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
+[[maybe_unused]] static void dumpScopes(SmallVectorImpl<CHRScope *> &Scopes,
+ const char *Label) {
dbgs() << Label << " " << Scopes.size() << "\n";
for (CHRScope *Scope : Scopes) {
dbgs() << *Scope << "\n";
@@ -2092,8 +2092,6 @@ bool CHR::run() {
return Changed;
}
-namespace llvm {
-
ControlHeightReductionPass::ControlHeightReductionPass() {
parseCHRFilterFiles();
}
@@ -2116,5 +2114,3 @@ PreservedAnalyses ControlHeightReductionPass::run(
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index c86092b..a6ec6c1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/StaticDataProfileInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee,
}
}
+static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
+ AnnotationKind Kind) {
+ assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
+ "Should not handle AnnotationOK here");
+ SmallString<32> Reason;
+ switch (Kind) {
+ case llvm::memprof::AnnotationKind::ExplicitSection:
+ ++NumOfMemProfExplicitSectionGlobalVars;
+ Reason.append("explicit section name");
+ break;
+ case llvm::memprof::AnnotationKind::DeclForLinker:
+ Reason.append("linker declaration");
+ break;
+ case llvm::memprof::AnnotationKind::ReservedName:
+ Reason.append("name starts with `llvm.`");
+ break;
+ default:
+ llvm_unreachable("Unexpected annotation kind");
+ }
+ LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
+ << Reason << ".\n");
+ return;
+}
+
struct AllocMatchInfo {
uint64_t TotalSize = 0;
AllocationType AllocType = AllocationType::None;
@@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::none();
}
-// Returns true iff the global variable has custom section either by
-// __attribute__((section("name")))
-// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
-// or #pragma clang section directives
-// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
-static bool hasExplicitSectionName(const GlobalVariable &GVar) {
- if (GVar.hasSection())
- return true;
-
- auto Attrs = GVar.getAttributes();
- if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
- Attrs.hasAttribute("relro-section") ||
- Attrs.hasAttribute("rodata-section"))
- return true;
- return false;
-}
-
bool MemProfUsePass::annotateGlobalVariables(
Module &M, const memprof::DataAccessProfData *DataAccessProf) {
if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
return false;
if (!DataAccessProf) {
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U);
M.getContext().diagnose(DiagnosticInfoPGOProfile(
MemoryProfileFileName.data(),
StringRef("Data access profiles not found in memprof. Ignore "
@@ -805,6 +814,7 @@ bool MemProfUsePass::annotateGlobalVariables(
DS_Warning));
return false;
}
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U);
bool Changed = false;
// Iterate all global variables in the module and annotate them based on
@@ -815,13 +825,9 @@ bool MemProfUsePass::annotateGlobalVariables(
for (GlobalVariable &GVar : M.globals()) {
assert(!GVar.getSectionPrefix().has_value() &&
"GVar shouldn't have section prefix yet");
- if (GVar.isDeclarationForLinker())
- continue;
-
- if (hasExplicitSectionName(GVar)) {
- ++NumOfMemProfExplicitSectionGlobalVars;
- LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
- << " has explicit section name. Skip annotating.\n");
+ auto Kind = llvm::memprof::getAnnotationKind(GVar);
+ if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
+ HandleUnsupportedAnnotationKinds(GVar, Kind);
continue;
}
@@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables(
// TODO: Track string content hash in the profiles and compute it inside the
// compiler to categeorize the hotness string literals.
if (Name.starts_with(".str")) {
-
LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
continue;
}
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 66a2c76..386e48f 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -326,8 +326,7 @@ const unsigned BBState::OverflowOccurredValue = 0xffffffff;
namespace llvm {
-raw_ostream &operator<<(raw_ostream &OS,
- BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, BBState &BBState);
} // end namespace llvm
@@ -2626,7 +2625,7 @@ void ObjCARCOpt::OptimizeAutoreleasePools(Function &F) {
case ARCInstKind::Call:
if (!MayAutorelease(cast<CallBase>(Inst)))
break;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
case ARCInstKind::FusedRetainAutorelease:
diff --git a/llvm/lib/Transforms/ObjCARC/PtrState.h b/llvm/lib/Transforms/ObjCARC/PtrState.h
index 232db2b..5cc4212 100644
--- a/llvm/lib/Transforms/ObjCARC/PtrState.h
+++ b/llvm/lib/Transforms/ObjCARC/PtrState.h
@@ -47,8 +47,7 @@ enum Sequence {
S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
};
-raw_ostream &operator<<(raw_ostream &OS,
- const Sequence S) LLVM_ATTRIBUTE_UNUSED;
+[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, const Sequence S);
/// Unidirectional information about either a
/// retain-decrement-use-release sequence or release-use-decrement-retain
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index ff5f390..66e45ec 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -266,8 +266,7 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
if (!ProfcheckDisableMetadataFixes)
BI->setMetadata(LLVMContext::MD_prof,
SI->getMetadata(LLVMContext::MD_prof));
- DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock},
- {DominatorTree::Insert, StartBlock, NewBlock}});
+ DTU->applyUpdates({{DominatorTree::Insert, StartBlock, NewBlock}});
} else {
BasicBlock *EndBlock = SIUse->getParent();
BasicBlock *NewBlockT = BasicBlock::Create(
@@ -1479,10 +1478,13 @@ bool DFAJumpThreading::run(Function &F) {
DTU->flush();
#ifdef EXPENSIVE_CHECKS
- assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full));
verifyFunction(F, &dbgs());
#endif
+ if (MadeChanges && VerifyDomInfo)
+ assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full) &&
+ "Failed to maintain validity of domtree!");
+
return MadeChanges;
}
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 7ad710d..6141b6d 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -77,6 +77,7 @@
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -805,9 +806,8 @@ tryToMergePartialOverlappingStores(StoreInst *KillingI, StoreInst *DeadI,
return nullptr;
}
-namespace {
// Returns true if \p I is an intrinsic that does not read or write memory.
-bool isNoopIntrinsic(Instruction *I) {
+static bool isNoopIntrinsic(Instruction *I) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
@@ -828,7 +828,7 @@ bool isNoopIntrinsic(Instruction *I) {
}
// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+static bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
Instruction *DI = D->getMemoryInst();
// Calls that only access inaccessible memory cannot read or write any memory
// locations we consider for elimination.
@@ -856,6 +856,8 @@ bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
return false;
}
+namespace {
+
// A memory location wrapper that represents a MemoryLocation, `MemLoc`,
// defined by `MemDef`.
struct MemoryLocationWrapper {
@@ -889,23 +891,25 @@ struct MemoryDefWrapper {
SmallVector<MemoryLocationWrapper, 1> DefinedLocations;
};
-bool hasInitializesAttr(Instruction *I) {
- CallBase *CB = dyn_cast<CallBase>(I);
- return CB && CB->getArgOperandWithAttribute(Attribute::Initializes);
-}
-
struct ArgumentInitInfo {
unsigned Idx;
bool IsDeadOrInvisibleOnUnwind;
ConstantRangeList Inits;
};
+} // namespace
+
+static bool hasInitializesAttr(Instruction *I) {
+ CallBase *CB = dyn_cast<CallBase>(I);
+ return CB && CB->getArgOperandWithAttribute(Attribute::Initializes);
+}
// Return the intersected range list of the initializes attributes of "Args".
// "Args" are call arguments that alias to each other.
// If any argument in "Args" doesn't have dead_on_unwind attr and
// "CallHasNoUnwindAttr" is false, return empty.
-ConstantRangeList getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args,
- bool CallHasNoUnwindAttr) {
+static ConstantRangeList
+getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args,
+ bool CallHasNoUnwindAttr) {
if (Args.empty())
return {};
@@ -925,6 +929,8 @@ ConstantRangeList getIntersectedInitRangeList(ArrayRef<ArgumentInitInfo> Args,
return IntersectedIntervals;
}
+namespace {
+
struct DSEState {
Function &F;
AliasAnalysis &AA;
@@ -2328,10 +2334,11 @@ struct DSEState {
// change state: whether make any change.
bool eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper);
};
+} // namespace
// Return true if "Arg" is function local and isn't captured before "CB".
-bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB,
- EarliestEscapeAnalysis &EA) {
+static bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB,
+ EarliestEscapeAnalysis &EA) {
const Value *UnderlyingObj = getUnderlyingObject(Arg);
return isIdentifiedFunctionLocal(UnderlyingObj) &&
capturesNothing(
@@ -2627,7 +2634,6 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
return MadeChange;
}
-} // end anonymous namespace
//===----------------------------------------------------------------------===//
// DSE Pass
@@ -2728,8 +2734,6 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
false)
-namespace llvm {
-LLVM_ABI FunctionPass *createDeadStoreEliminationPass() {
+LLVM_ABI FunctionPass *llvm::createDeadStoreEliminationPass() {
return new DSELegacyPass();
}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 213d0f3..1335665 100644
--- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -39,10 +39,11 @@ public:
private:
AliasAnalysis *AA;
};
+} // namespace
/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
/// iterating until no more changes are made.
-bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
bool Changed = false;
bool LocalChange = true;
@@ -67,7 +68,6 @@ bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
}
return Changed;
}
-} // namespace
char FlattenCFGLegacyPass::ID = 0;
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 1c88532..b9534def 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -73,24 +73,17 @@
#include <utility>
using namespace llvm;
+using namespace llvm::GVNExpression;
#define DEBUG_TYPE "gvn-sink"
STATISTIC(NumRemoved, "Number of instructions removed");
-namespace llvm {
-namespace GVNExpression {
-
LLVM_DUMP_METHOD void Expression::dump() const {
print(dbgs());
dbgs() << "\n";
}
-} // end namespace GVNExpression
-} // end namespace llvm
-
-namespace {
-
static bool isMemoryInst(const Instruction *I) {
return isa<LoadInst>(I) || isa<StoreInst>(I) ||
(isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
@@ -99,6 +92,8 @@ static bool isMemoryInst(const Instruction *I) {
//===----------------------------------------------------------------------===//
+namespace {
+
/// Candidate solution for sinking. There may be different ways to
/// sink instructions, differing in the number of instructions sunk,
/// the number of predecessors sunk from and the number of PHIs
@@ -125,14 +120,6 @@ struct SinkingInstructionCandidate {
}
};
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
- OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
- << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
- return OS;
-}
-#endif
-
//===----------------------------------------------------------------------===//
/// Describes a PHI node that may or may not exist. These track the PHIs
@@ -256,8 +243,18 @@ public:
return Values == Other.Values && Blocks == Other.Blocks;
}
};
+} // namespace
-template <typename ModelledPHI> struct DenseMapInfo {
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS,
+ const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+template <> struct llvm::DenseMapInfo<ModelledPHI> {
static inline ModelledPHI &getEmptyKey() {
static ModelledPHI Dummy = ModelledPHI::createDummy(0);
return Dummy;
@@ -275,7 +272,9 @@ template <typename ModelledPHI> struct DenseMapInfo {
}
};
-using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
+using ModelledPHISet = DenseSet<ModelledPHI>;
+
+namespace {
//===----------------------------------------------------------------------===//
// ValueTable
@@ -290,7 +289,7 @@ using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
///
/// This class also contains fields for discriminators used when determining
/// equivalence of instructions with sideeffects.
-class InstructionUseExpr : public GVNExpression::BasicExpression {
+class InstructionUseExpr : public BasicExpression {
unsigned MemoryUseOrder = -1;
bool Volatile = false;
ArrayRef<int> ShuffleMask;
@@ -298,7 +297,7 @@ class InstructionUseExpr : public GVNExpression::BasicExpression {
public:
InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
BumpPtrAllocator &A)
- : GVNExpression::BasicExpression(I->getNumUses()) {
+ : BasicExpression(I->getNumUses()) {
allocateOperands(R, A);
setOpcode(I->getOpcode());
setType(I->getType());
@@ -315,8 +314,8 @@ public:
void setVolatile(bool V) { Volatile = V; }
hash_code getHashValue() const override {
- return hash_combine(GVNExpression::BasicExpression::getHashValue(),
- MemoryUseOrder, Volatile, ShuffleMask);
+ return hash_combine(BasicExpression::getHashValue(), MemoryUseOrder,
+ Volatile, ShuffleMask);
}
template <typename Function> hash_code getHashValue(Function MapFn) {
@@ -332,7 +331,7 @@ using BasicBlocksSet = SmallPtrSet<const BasicBlock *, 32>;
class ValueTable {
DenseMap<Value *, uint32_t> ValueNumbering;
- DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<Expression *, uint32_t> ExpressionNumbering;
DenseMap<size_t, uint32_t> HashNumbering;
BumpPtrAllocator Allocator;
ArrayRecycler<Value *> Recycler;
@@ -594,6 +593,7 @@ private:
}
}
};
+} // namespace
std::optional<SinkingInstructionCandidate>
GVNSink::analyzeInstructionForSinking(LockstepReverseIterator<false> &LRI,
@@ -851,8 +851,6 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
NumRemoved += Insts.size() - 1;
}
-} // end anonymous namespace
-
PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
GVNSink G;
if (!G.run(F))
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index d99f1eb..ddb99a5 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -75,8 +75,6 @@ static cl::opt<bool>
"expressed as branches by widenable conditions"),
cl::init(true));
-namespace {
-
// Get the condition of \p I. It can either be a guard or a conditional branch.
static Value *getCondition(Instruction *I) {
if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
@@ -130,6 +128,8 @@ findInsertionPointForWideCondition(Instruction *WCOrGuard) {
return std::nullopt;
}
+namespace {
+
class GuardWideningImpl {
DominatorTree &DT;
PostDominatorTree *PDT;
@@ -328,7 +328,7 @@ public:
/// The entry point for this pass.
bool run();
};
-}
+} // namespace
static bool isSupportedGuardInstruction(const Instruction *Insn) {
if (isGuard(Insn))
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index c327311..7ebcc21 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -53,6 +53,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@@ -117,6 +118,10 @@ static cl::opt<bool>
LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
cl::desc("Predicate conditions in read only loops"));
+static cl::opt<bool> LoopPredicationTraps(
+ "indvars-predicate-loop-traps", cl::Hidden, cl::init(true),
+ cl::desc("Predicate conditions that trap in loops with only local writes"));
+
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
cl::desc("Allow widening of indvars to eliminate s/zext"));
@@ -1704,6 +1709,24 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
return Changed;
}
+static bool crashingBBWithoutEffect(const BasicBlock &BB) {
+ return llvm::all_of(BB, [](const Instruction &I) {
+ // TODO: for now this is overly restrictive, to make sure nothing in this
+ // BB can depend on the loop body.
+ // It's not enough to check for !I.mayHaveSideEffects(), because e.g. a
+ // load does not have a side effect, but we could have
+ // %a = load ptr, ptr %ptr
+ // %b = load i32, ptr %a
+ // Now if the loop stored a non-nullptr to %a, we could cause a nullptr
+ // dereference by skipping over loop iterations.
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->onlyAccessesInaccessibleMemory())
+ return true;
+ }
+ return isa<UnreachableInst>(I);
+ });
+}
+
bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -1816,11 +1839,25 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// suggestions on how to improve this? I can obviously bail out for outer
// loops, but that seems less than ideal. MemorySSA can find memory writes,
// is that enough for *all* side effects?
+ bool HasThreadLocalSideEffects = false;
for (BasicBlock *BB : L->blocks())
for (auto &I : *BB)
// TODO:isGuaranteedToTransfer
- if (I.mayHaveSideEffects())
- return false;
+ if (I.mayHaveSideEffects()) {
+ if (!LoopPredicationTraps)
+ return false;
+ HasThreadLocalSideEffects = true;
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // Simple stores cannot be observed by other threads.
+ // If HasThreadLocalSideEffects is set, we check
+ // crashingBBWithoutEffect to make sure that the crashing BB cannot
+ // observe them either.
+ if (!SI->isSimple())
+ return false;
+ } else {
+ return false;
+ }
+ }
bool Changed = false;
// Finally, do the actual predication for all predicatable blocks. A couple
@@ -1840,6 +1877,19 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (HasThreadLocalSideEffects) {
+ const BasicBlock *Unreachable = nullptr;
+ for (const BasicBlock *Succ : BI->successors()) {
+ if (isa<UnreachableInst>(Succ->getTerminator()))
+ Unreachable = Succ;
+ }
+ // Exit BB which have one branch back into the loop and another one to
+ // a trap can still be optimized, because local side effects cannot
+ // be observed in the exit case (the trap). We could be smarter about
+ // this, but for now lets pattern match common cases that directly trap.
+ if (Unreachable == nullptr || !crashingBBWithoutEffect(*Unreachable))
+ return Changed;
+ }
Value *NewCond;
if (ExitCount == ExactBTC) {
NewCond = L->contains(BI->getSuccessor(0)) ?
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 3c14036e..6fb8197 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -26,8 +26,6 @@
using namespace llvm;
-namespace llvm {
-
static cl::opt<unsigned>
JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden,
cl::desc("Only split jump tables with size less or "
@@ -43,8 +41,8 @@ static cl::opt<unsigned> FunctionSizeThreshold(
"or equal than this threshold."),
cl::init(50));
+namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
} // end namespace llvm
#define DEBUG_TYPE "jump-table-to-switch"
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9655173..b2c526b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -116,8 +116,6 @@ STATISTIC(NumIntAssociationsHoisted,
STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
"reassociated and hoisted out of the loop");
-namespace llvm {
-
/// Memory promotion is enabled by default.
static cl::opt<bool>
DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
@@ -156,7 +154,7 @@ static cl::opt<unsigned> IntAssociationUpperLimit(
// which may not be precise, since optimizeUses is capped. The result is
// correct, but we may not get as "far up" as possible to get which access is
// clobbering the one queried.
-cl::opt<unsigned> SetLicmMssaOptCap(
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
"licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
"for faster compile. Caps the MemorySSA clobbering calls."));
@@ -164,15 +162,15 @@ cl::opt<unsigned> SetLicmMssaOptCap(
// Experimentally, memory promotion carries less importance than sinking and
// hoisting. Limit when we do promotion when using MemorySSA, in order to save
// compile time.
-cl::opt<unsigned> SetLicmMssaNoAccForPromotionCap(
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
"licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
"effect. When MSSA in LICM is enabled, then this is the maximum "
"number of accesses allowed to be present in a loop in order to "
"enable memory promotion."));
+namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
-
} // end namespace llvm
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
@@ -1120,11 +1118,10 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
return false;
}
-namespace {
/// Return true if-and-only-if we know how to (mechanically) both hoist and
/// sink a given instruction out of a loop. Does not address legality
/// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
+static bool isHoistableAndSinkableInst(Instruction &I) {
// Only these instructions are hoistable/sinkable.
return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
@@ -1136,8 +1133,8 @@ bool isHoistableAndSinkableInst(Instruction &I) {
}
/// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
- const MemorySSAUpdater &MSSAU) {
+static bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+ const MemorySSAUpdater &MSSAU) {
for (auto *BB : L->getBlocks())
if (auto *Accs = MSSAU.getMemorySSA()->getBlockAccesses(BB)) {
int NotAPhi = 0;
@@ -1151,7 +1148,6 @@ bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
}
return true;
}
-}
static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA,
BatchAAResults &BAA,
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 73f1942..7706de8 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -21,8 +21,7 @@
#define DEBUG_TYPE "loop-bound-split"
-namespace llvm {
-
+using namespace llvm;
using namespace PatternMatch;
namespace {
@@ -358,8 +357,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
IRBuilder<> Builder(&PostLoopPreHeader->front());
// Update phi nodes in header of post-loop.
- bool isExitingLatch =
- (L.getExitingBlock() == L.getLoopLatch()) ? true : false;
+ bool isExitingLatch = L.getExitingBlock() == L.getLoopLatch();
Value *ExitingCondLCSSAPhi = nullptr;
for (PHINode &PN : L.getHeader()->phis()) {
// Create LCSSA phi node in preheader of post-loop.
@@ -472,8 +470,7 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- Function &F = *L.getHeader()->getParent();
- (void)F;
+ [[maybe_unused]] Function &F = *L.getHeader()->getParent();
LLVM_DEBUG(dbgs() << "Spliting bound of loop in " << F.getName() << ": " << L
<< "\n");
@@ -486,5 +483,3 @@ PreservedAnalyses LoopBoundSplitPass::run(Loop &L, LoopAnalysisManager &AM,
return getLoopPassPreservedAnalyses();
}
-
-} // end namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 20733032..19eccb9 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -368,7 +368,7 @@ private:
Valid = false;
}
- bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ bool reportInvalidCandidate(Statistic &Stat) const {
using namespace ore;
assert(L && Preheader && "Fusion candidate not initialized properly!");
#if LLVM_ENABLE_STATS
@@ -445,6 +445,7 @@ struct FusionCandidateCompare {
"No dominance relationship between these fusion candidates!");
}
};
+} // namespace
using LoopVector = SmallVector<Loop *, 4>;
@@ -461,9 +462,15 @@ using LoopVector = SmallVector<Loop *, 4>;
using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+ dbgs() << "****************************\n";
+ for (const Loop *L : LV)
+ printLoop(*L, dbgs());
+ dbgs() << "****************************\n";
+}
+
+static raw_ostream &operator<<(raw_ostream &OS, const FusionCandidate &FC) {
if (FC.isValid())
OS << FC.Preheader->getName();
else
@@ -472,8 +479,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
return OS;
}
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidateSet &CandSet) {
+static raw_ostream &operator<<(raw_ostream &OS,
+ const FusionCandidateSet &CandSet) {
for (const FusionCandidate &FC : CandSet)
OS << FC << '\n';
@@ -489,7 +496,9 @@ printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
dbgs() << "****************************\n";
}
}
-#endif
+#endif // NDEBUG
+
+namespace {
/// Collect all loops in function at the same nest level, starting at the
/// outermost level.
@@ -550,15 +559,6 @@ private:
LoopsOnLevelTy LoopsOnLevel;
};
-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
- dbgs() << "****************************\n";
- for (auto *L : LV)
- printLoop(*L, dbgs());
- dbgs() << "****************************\n";
-}
-#endif
-
struct LoopFuser {
private:
// Sets of control flow equivalent fusion candidates for a given nest level.
@@ -1850,7 +1850,7 @@ private:
/// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
template <typename RemarkKind>
void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
- llvm::Statistic &Stat) {
+ Statistic &Stat) {
assert(FC0.Preheader && FC1.Preheader &&
"Expecting valid fusion candidates");
using namespace ore;
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 32078b1..7da8586 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -16,8 +16,6 @@
using namespace llvm;
-namespace llvm {
-
/// Explicitly specialize the pass manager's run method to handle loop nest
/// structure updates.
PreservedAnalyses
@@ -185,7 +183,6 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
}
return PA;
}
-} // namespace llvm
void FunctionToLoopPassAdaptor::printPipeline(
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
@@ -193,6 +190,7 @@ void FunctionToLoopPassAdaptor::printPipeline(
Pass->printPipeline(OS, MapClassName2PassName);
OS << ')';
}
+
PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 448dc2b..f3e6cbf 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -540,8 +540,6 @@ bool LoopVersioningLICM::run(DominatorTree *DT) {
return Changed;
}
-namespace llvm {
-
PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &LAR,
LPMUpdater &U) {
@@ -556,4 +554,3 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 56e0569..3487e81 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -97,6 +97,12 @@ static cl::opt<MatrixLayoutTy> MatrixLayout(
static cl::opt<bool> PrintAfterTransposeOpt("matrix-print-after-transpose-opt",
cl::init(false));
+static cl::opt<unsigned> SplitMatmulRemainderOverThreshold(
+ "matrix-split-matmul-remainder-over-threshold", cl::Hidden,
+ cl::desc("Illegal remainder vectors over this size in bits should be split "
+ "in the inner loop of matmul"),
+ cl::init(0));
+
/// Helper function to either return Scope, if it is a subprogram or the
/// attached subprogram for a local scope.
static DISubprogram *getSubprogram(DIScope *Scope) {
@@ -115,18 +121,16 @@ static bool isSplat(Value *V) {
/// Match any mul operation (fp or integer).
template <typename LTy, typename RTy>
-auto m_AnyMul(const LTy &L, const RTy &R) {
+static auto m_AnyMul(const LTy &L, const RTy &R) {
return m_CombineOr(m_Mul(L, R), m_FMul(L, R));
}
/// Match any add operation (fp or integer).
template <typename LTy, typename RTy>
-auto m_AnyAdd(const LTy &L, const RTy &R) {
+static auto m_AnyAdd(const LTy &L, const RTy &R) {
return m_CombineOr(m_Add(L, R), m_FAdd(L, R));
}
-namespace {
-
// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
// assuming \p Stride elements between start two consecutive vectors.
@@ -167,9 +171,9 @@ namespace {
// v_2_0 |v_2_1 |v_2_2 |v_2_3
// v_3_0 {v_3_1 {v_3_2 v_3_3
//
-Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
- unsigned NumElements, Type *EltType,
- IRBuilder<> &Builder) {
+static Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+ unsigned NumElements, Type *EltType,
+ IRBuilder<> &Builder) {
assert((!isa<ConstantInt>(Stride) ||
cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
@@ -338,6 +342,8 @@ computeShapeInfoForInst(Instruction *I,
return std::nullopt;
}
+namespace {
+
/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
///
/// Currently, the lowering for each matrix intrinsic is done as follows:
@@ -371,7 +377,8 @@ class LowerMatrixIntrinsics {
LoopInfo *LI = nullptr;
OptimizationRemarkEmitter *ORE = nullptr;
- /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
+ /// Contains estimates of the number of operations (loads, stores, compute)
+ /// required to lower a matrix operation.
struct OpInfoTy {
/// Number of stores emitted to generate this matrix.
unsigned NumStores = 0;
@@ -1295,6 +1302,24 @@ public:
return commonAlignment(InitialAlign, ElementSizeInBits / 8);
}
+ IntegerType *getIndexType(Value *Ptr) const {
+ return cast<IntegerType>(DL.getIndexType(Ptr->getType()));
+ }
+
+ Value *getIndex(Value *Ptr, uint64_t V) const {
+ return ConstantInt::get(getIndexType(Ptr), V);
+ }
+
+ Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
+ assert(isa<IntegerType>(V->getType()) &&
+ "Attempted to cast non-integral type to integer index");
+ // In case the data layout's index type differs in width from the type of
+ // the value we're given, truncate or zero extend to the appropriate width.
+ // We zero extend here as indices are unsigned.
+ return Builder.CreateZExtOrTrunc(V, getIndexType(Ptr),
+ V->getName() + ".cast");
+ }
+
/// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
/// vectors.
MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
@@ -1304,6 +1329,7 @@ public:
Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
Value *EltPtr = Ptr;
MatrixTy Result;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
Value *GEP = computeVectorAddr(
EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
@@ -1325,14 +1351,14 @@ public:
ShapeInfo ResultShape, Type *EltTy,
IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
ResultShape.NumColumns);
return loadMatrix(TileTy, TileStart, Align,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
ResultShape, Builder);
}
@@ -1363,14 +1389,15 @@ public:
MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
StoreVal.getNumColumns());
storeMatrix(TileTy, StoreVal, TileStart, MAlign,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
+ Builder);
}
/// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
@@ -1380,6 +1407,7 @@ public:
IRBuilder<> &Builder) {
auto *VType = cast<FixedVectorType>(Ty);
Value *EltPtr = Ptr;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (auto Vec : enumerate(StoreVal.vectors())) {
Value *GEP = computeVectorAddr(
EltPtr,
@@ -1698,6 +1726,31 @@ public:
ToRemove.push_back(MatMul);
}
+ /// Given \p Remainder iterations of the matmul inner loop,
+ /// potentially lower \p BlockSize that is used for the underlying
+ /// vector.
+ unsigned capBlockSize(unsigned BlockSize, unsigned Remainder, Type *EltType) {
+ if (BlockSize <= Remainder)
+ return BlockSize;
+
+ // If the remainder is also a legal type just use it.
+ auto *VecTy = FixedVectorType::get(EltType, Remainder);
+ if (TTI.isTypeLegal(VecTy))
+ return Remainder;
+
+ // Similarly, if the vector is small enough that we don't want
+ // to split further.
+ if (VecTy->getPrimitiveSizeInBits() <= SplitMatmulRemainderOverThreshold)
+ return Remainder;
+
+ // Gradually lower the vectorization factor to cover the
+ // remainder.
+ do {
+ BlockSize /= 2;
+ } while (BlockSize > Remainder);
+ return BlockSize;
+ }
+
/// Compute \p Result += \p A * \p B for input matrices with left-associating
/// addition.
///
@@ -1735,10 +1788,8 @@ public:
bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
for (unsigned I = 0; I < R; I += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (I + BlockSize > R)
- BlockSize /= 2;
-
+ // Lower block size to make sure we stay within bounds.
+ BlockSize = capBlockSize(BlockSize, R - I, Result.getElementType());
Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder)
: nullptr;
for (unsigned K = 0; K < M; ++K) {
@@ -1763,9 +1814,8 @@ public:
unsigned BlockSize = VF;
bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
for (unsigned J = 0; J < C; J += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (J + BlockSize > C)
- BlockSize /= 2;
+ // Lower the vectorization factor to cover the remainder.
+ BlockSize = capBlockSize(BlockSize, C - J, Result.getElementType());
Value *Sum = nullptr;
for (unsigned K = 0; K < M; ++K) {
@@ -2011,18 +2061,17 @@ public:
const unsigned TileM = std::min(M - K, unsigned(TileSize));
MatrixTy A =
loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
- LShape, Builder.getInt64(I), Builder.getInt64(K),
+ LShape, getIndex(APtr, I), getIndex(APtr, K),
{TileR, TileM}, EltType, Builder);
MatrixTy B =
loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
- RShape, Builder.getInt64(K), Builder.getInt64(J),
+ RShape, getIndex(BPtr, K), getIndex(BPtr, J),
{TileM, TileC}, EltType, Builder);
emitMatrixMultiply(Res, A, B, Builder, true, false,
getFastMathFlags(MatMul));
}
storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
- Builder.getInt64(I), Builder.getInt64(J), EltType,
- Builder);
+ getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder);
}
}
@@ -2254,15 +2303,14 @@ public:
/// Lower load instructions.
MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
IRBuilder<> &Builder) {
- return LowerLoad(Inst, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
- Builder);
+ return LowerLoad(Inst, Ptr, Inst->getAlign(), getIndex(Ptr, SI.getStride()),
+ Inst->isVolatile(), SI, Builder);
}
MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
Value *Ptr, IRBuilder<> &Builder) {
return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+ getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI,
Builder);
}
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 80aa98d..5a8f18a 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -160,9 +160,6 @@ static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
//===----------------------------------------------------------------------===//
// Anchor methods.
-namespace llvm {
-namespace GVNExpression {
-
Expression::~Expression() = default;
BasicExpression::~BasicExpression() = default;
CallExpression::~CallExpression() = default;
@@ -171,9 +168,6 @@ StoreExpression::~StoreExpression() = default;
AggregateValueExpression::~AggregateValueExpression() = default;
PHIExpression::~PHIExpression() = default;
-} // end namespace GVNExpression
-} // end namespace llvm
-
namespace {
// Tarjan's SCC finding algorithm with Nuutila's improvements
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index ba58b8e..6d7ce36 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -2623,32 +2623,32 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
namespace {
- class ReassociateLegacyPass : public FunctionPass {
- ReassociatePass Impl;
+class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
+public:
+ static char ID; // Pass identification, replacement for typeid
- ReassociateLegacyPass() : FunctionPass(ID) {
- initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- };
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
} // end anonymous namespace
diff --git a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 30b27cb..7646624 100644
--- a/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -107,9 +107,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
return PA;
}
-namespace llvm {
-
-void initializeRegToMemWrapperPassPass(PassRegistry &);
+namespace {
class RegToMemWrapperPass : public FunctionPass {
public:
@@ -136,7 +134,7 @@ public:
return N != 0 || Changed;
}
};
-} // namespace llvm
+} // namespace
INITIALIZE_PASS_BEGIN(RegToMemWrapperPass, "reg2mem", "", true, true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index b9d332b..5c60fad 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -118,9 +118,13 @@ STATISTIC(
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");
+namespace llvm {
/// Disable running mem2reg during SROA in order to test or debug SROA.
static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
cl::Hidden);
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // namespace llvm
+
namespace {
class AllocaSliceRewriter;
@@ -340,6 +344,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
uint64_t SliceSizeInBits, Instruction *OldInst,
Instruction *Inst, Value *Dest, Value *Value,
const DataLayout &DL) {
+ // If we want allocas to be migrated using this helper then we need to ensure
+ // that the BaseFragments map code still works. A simple solution would be
+ // to choose to always clone alloca dbg_assigns (rather than sometimes
+ // "stealing" them).
+ assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
+
auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
// Nothing to do if OldInst has no linked dbg.assign intrinsics.
if (DVRAssignMarkerRange.empty())
@@ -425,11 +435,22 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
}
- ::Value *NewValue = Value ? Value : DbgAssign->getValue();
- DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
- DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
- Dest, DIExpression::get(Expr->getContext(), {}),
- DbgAssign->getDebugLoc())));
+ DbgVariableRecord *NewAssign;
+ if (IsSplit) {
+ ::Value *NewValue = Value ? Value : DbgAssign->getValue();
+ NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
+ DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
+ Dest, DIExpression::get(Expr->getContext(), {}),
+ DbgAssign->getDebugLoc())));
+ } else {
+ // The store is not split, simply steal the existing dbg_assign.
+ NewAssign = DbgAssign;
+ NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
+ NewAssign->setAddress(Dest);
+ if (Value)
+ NewAssign->replaceVariableLocationOp(0u, Value);
+ assert(Expr == NewAssign->getExpression());
+ }
// If we've updated the value but the original dbg.assign has an arglist
// then kill it now - we can't use the requested new value.
@@ -460,9 +481,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
// noted as slightly offset (in code) from the store. In practice this
// should have little effect on the debugging experience due to the fact
// that all the split stores should get the same line number.
- NewAssign->moveBefore(DbgAssign->getIterator());
-
- NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+ if (NewAssign != DbgAssign) {
+ NewAssign->moveBefore(DbgAssign->getIterator());
+ NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
+ }
LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
};
@@ -547,12 +569,10 @@ public:
}
/// Support comparison with a single offset to allow binary searches.
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
- uint64_t RHSOffset) {
+ [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
return LHS.beginOffset() < RHSOffset;
}
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
- const Slice &RHS) {
+ [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
return LHSOffset < RHS.beginOffset();
}
@@ -1777,7 +1797,8 @@ static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
}
Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
- LI.getName() + ".sroa.speculated");
+ LI.getName() + ".sroa.speculated",
+ ProfcheckDisableMetadataFixes ? nullptr : &SI);
LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
LI.replaceAllUsesWith(V);
@@ -2662,7 +2683,9 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
- V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
+ // No profiling support for vector selects.
+ V = IRB.CreateSelectWithUnknownProfile(ConstantVector::get(Mask2), V, Old,
+ DEBUG_TYPE, Name + "blend");
LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
return V;
@@ -4360,10 +4383,13 @@ private:
};
Value *Cond, *True, *False;
+ Instruction *MDFrom = nullptr;
if (auto *SI = dyn_cast<SelectInst>(Sel)) {
Cond = SI->getCondition();
True = SI->getTrueValue();
False = SI->getFalseValue();
+ if (!ProfcheckDisableMetadataFixes)
+ MDFrom = SI;
} else {
Cond = Sel->getOperand(0);
True = ConstantInt::get(Sel->getType(), 1);
@@ -4383,8 +4409,12 @@ private:
IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
False->getName() + ".sroa.gep", NW);
- Value *NSel =
- IRB.CreateSelect(Cond, NTrue, NFalse, Sel->getName() + ".sroa.sel");
+ Value *NSel = MDFrom
+ ? IRB.CreateSelect(Cond, NTrue, NFalse,
+ Sel->getName() + ".sroa.sel", MDFrom)
+ : IRB.CreateSelectWithUnknownProfile(
+ Cond, NTrue, NFalse, DEBUG_TYPE,
+ Sel->getName() + ".sroa.sel");
Visited.erase(&GEPI);
GEPI.replaceAllUsesWith(NSel);
GEPI.eraseFromParent();
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index aae5d60..25a531c 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -50,9 +50,7 @@ using namespace llvm;
#define DEBUG_TYPE "scalarizer"
-namespace {
-
-BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
+static BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) {
BasicBlock *BB = Itr->getParent();
if (isa<PHINode>(Itr))
Itr = BB->getFirstInsertionPt();
@@ -76,6 +74,8 @@ using ScatterMap = std::map<std::pair<Value *, Type *>, ValueVector>;
// along with a pointer to their scattered forms.
using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+namespace {
+
struct VectorSplit {
// The type of the vector.
FixedVectorType *VecTy = nullptr;
@@ -196,6 +196,7 @@ struct VectorLayout {
// The size of each (non-remainder) fragment in bytes.
uint64_t SplitSize = 0;
};
+} // namespace
static bool isStructOfMatchingFixedVectors(Type *Ty) {
if (!isa<StructType>(Ty))
@@ -268,6 +269,7 @@ static Value *concatenate(IRBuilder<> &Builder, ArrayRef<Value *> Fragments,
return Res;
}
+namespace {
class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
public:
ScalarizerVisitor(DominatorTree *DT, const TargetTransformInfo *TTI,
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index ebcbd2b..fa66a03 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -149,8 +149,6 @@ bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
return Impl.runImpl(F, TTI);
}
-namespace llvm {
-
bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) {
LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
@@ -328,11 +326,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
return true;
}
-FunctionPass *createSpeculativeExecutionPass() {
+FunctionPass *llvm::createSpeculativeExecutionPass() {
return new SpeculativeExecutionLegacyPass();
}
-FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+FunctionPass *llvm::createSpeculativeExecutionIfHasBranchDivergencePass() {
return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
}
@@ -362,4 +360,3 @@ void SpeculativeExecutionPass::printPipeline(
OS << "only-if-divergent-target";
OS << '>';
}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 7d01709..e94ad19 100644
--- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -716,8 +716,6 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
return Ret;
}
-namespace llvm {
-
PreservedAnalyses
StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
const DataLayout *DL = &F.getDataLayout();
@@ -735,5 +733,3 @@ StraightLineStrengthReducePass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<TargetIRAnalysis>();
return PA;
}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 2ee91a9..0f3978f 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -47,6 +47,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
#include <cassert>
#include <utility>
@@ -321,7 +322,7 @@ class StructurizeCFG {
void collectInfos();
- void insertConditions(bool Loops);
+ void insertConditions(bool Loops, SSAUpdaterBulk &PhiInserter);
void simplifyConditions();
@@ -671,10 +672,9 @@ void StructurizeCFG::collectInfos() {
}
/// Insert the missing branch conditions
-void StructurizeCFG::insertConditions(bool Loops) {
+void StructurizeCFG::insertConditions(bool Loops, SSAUpdaterBulk &PhiInserter) {
BranchVector &Conds = Loops ? LoopConds : Conditions;
Value *Default = Loops ? BoolTrue : BoolFalse;
- SSAUpdater PhiInserter;
for (BranchInst *Term : Conds) {
assert(Term->isConditional());
@@ -683,8 +683,9 @@ void StructurizeCFG::insertConditions(bool Loops) {
BasicBlock *SuccTrue = Term->getSuccessor(0);
BasicBlock *SuccFalse = Term->getSuccessor(1);
- PhiInserter.Initialize(Boolean, "");
- PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+ unsigned Variable = PhiInserter.AddVariable("", Boolean);
+ PhiInserter.AddAvailableValue(Variable, Loops ? SuccFalse : Parent,
+ Default);
BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
@@ -697,7 +698,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
ParentInfo = PI;
break;
}
- PhiInserter.AddAvailableValue(BB, PI.Pred);
+ PhiInserter.AddAvailableValue(Variable, BB, PI.Pred);
Dominator.addAndRememberBlock(BB);
}
@@ -706,9 +707,9 @@ void StructurizeCFG::insertConditions(bool Loops) {
CondBranchWeights::setMetadata(*Term, ParentInfo.Weights);
} else {
if (!Dominator.resultIsRememberedBlock())
- PhiInserter.AddAvailableValue(Dominator.result(), Default);
+ PhiInserter.AddAvailableValue(Variable, Dominator.result(), Default);
- Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+ PhiInserter.AddUse(Variable, &Term->getOperandUse(0));
}
}
}
@@ -1414,8 +1415,12 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT,
orderNodes();
collectInfos();
createFlow();
- insertConditions(false);
- insertConditions(true);
+
+ SSAUpdaterBulk PhiInserter;
+ insertConditions(false, PhiInserter);
+ insertConditions(true, PhiInserter);
+ PhiInserter.RewriteAndOptimizeAllUses(*DT);
+
setPhiValues();
simplifyHoistedPhis();
simplifyConditions();
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 1d83ddc..89d41f3e 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -192,7 +192,7 @@ struct AllocaDerivedValueTracker {
SmallPtrSet<Instruction *, 32> AllocaUsers;
SmallPtrSet<Instruction *, 32> EscapePoints;
};
-}
+} // namespace
static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
if (F.callsFunctionThatReturnsTwice())
@@ -967,7 +967,7 @@ struct TailCallElim : public FunctionPass {
/*BFI=*/nullptr);
}
};
-}
+} // namespace
char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index b187208..32924e7 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -44,7 +44,7 @@ using namespace llvm;
STATISTIC(RemappedAtomMax, "Highest global NextAtomGroup (after mapping)");
void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
- auto CurGroup = DL->getAtomGroup();
+ uint64_t CurGroup = DL->getAtomGroup();
if (!CurGroup)
return;
@@ -62,21 +62,20 @@ void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
RemappedAtomMax = std::max<uint64_t>(NewGroup, RemappedAtomMax);
}
-namespace {
-void collectDebugInfoFromInstructions(const Function &F,
- DebugInfoFinder &DIFinder) {
+static void collectDebugInfoFromInstructions(const Function &F,
+ DebugInfoFinder &DIFinder) {
const Module *M = F.getParent();
- if (M) {
- // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
- for (const auto &I : instructions(F))
- DIFinder.processInstruction(*M, I);
- }
+ if (!M)
+ return;
+ // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
+ for (const Instruction &I : instructions(F))
+ DIFinder.processInstruction(*M, I);
}
// Create a predicate that matches the metadata that should be identity mapped
// during function cloning.
-MetadataPredicate createIdentityMDPredicate(const Function &F,
- CloneFunctionChangeType Changes) {
+static MetadataPredicate
+createIdentityMDPredicate(const Function &F, CloneFunctionChangeType Changes) {
if (Changes >= CloneFunctionChangeType::DifferentModule)
return [](const Metadata *MD) { return false; };
@@ -107,7 +106,6 @@ MetadataPredicate createIdentityMDPredicate(const Function &F,
return false;
};
}
-} // namespace
/// See comments in Cloning.h.
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
@@ -213,10 +211,9 @@ void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc,
const MetadataPredicate *IdentityMD) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc.getAllMetadata(MDs);
- for (auto MD : MDs) {
- NewFunc.addMetadata(MD.first,
- *MapMetadata(MD.second, VMap, RemapFlag, TypeMapper,
- Materializer, IdentityMD));
+ for (const auto &[Kind, MD] : MDs) {
+ NewFunc.addMetadata(Kind, *MapMetadata(MD, VMap, RemapFlag, TypeMapper,
+ Materializer, IdentityMD));
}
}
@@ -235,7 +232,6 @@ void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc,
// appropriate. Note that we save BE this way in order to handle cloning of
// recursive functions into themselves.
for (const BasicBlock &BB : OldFunc) {
-
// Create a new basic block and copy instructions into it!
BasicBlock *CBB =
CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo);
@@ -321,7 +317,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// Cloning is always a Module level operation, since Metadata needs to be
// cloned.
- const auto RemapFlag = RF_None;
+ const RemapFlags RemapFlag = RF_None;
CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper,
Materializer, &IdentityMD);
@@ -346,8 +342,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// visiting the metadata attached to global values, which would allow this
// code to be deleted. Alternatively, perhaps give responsibility for this
// update to CloneFunctionInto's callers.
- auto *NewModule = NewFunc->getParent();
- auto *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
+ Module *NewModule = NewFunc->getParent();
+ NamedMDNode *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
// Avoid multiple insertions of the same DICompileUnit to NMD.
SmallPtrSet<const void *, 8> Visited(llvm::from_range, NMD->operands());
@@ -355,7 +351,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// the function (e.g. as instructions' scope).
DebugInfoFinder DIFinder;
collectDebugInfoFromInstructions(*OldFunc, DIFinder);
- for (auto *Unit : DIFinder.compile_units()) {
+ for (DICompileUnit *Unit : DIFinder.compile_units()) {
MDNode *MappedUnit =
MapMetadata(Unit, VMap, RF_None, TypeMapper, Materializer);
if (Visited.insert(MappedUnit).second)
@@ -821,17 +817,16 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
--PredCount[Pred];
// Figure out how many entries to remove from each PHI.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- ++PredCount[PN->getIncomingBlock(i)];
+ for (BasicBlock *Pred : PN->blocks())
+ ++PredCount[Pred];
// At this point, the excess predecessor entries are positive in the
// map. Loop over all of the PHIs and remove excess predecessor
// entries.
BasicBlock::iterator I = NewBB->begin();
for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (const auto &PCI : PredCount) {
- BasicBlock *Pred = PCI.first;
- for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ for (const auto &[Pred, Count] : PredCount) {
+ for ([[maybe_unused]] unsigned _ : llvm::seq<unsigned>(Count))
PN->removeIncomingValue(Pred, false);
}
}
@@ -866,8 +861,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// As phi-nodes have been now remapped, allow incremental simplification of
// newly-cloned instructions.
const DataLayout &DL = NewFunc->getDataLayout();
- for (const auto &BB : *OldFunc) {
- for (const auto &I : BB) {
+ for (const BasicBlock &BB : *OldFunc) {
+ for (const Instruction &I : BB) {
auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I));
if (!NewI)
continue;
@@ -997,8 +992,8 @@ void llvm::CloneAndPruneFunctionInto(
void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
ValueToValueMapTy &VMap) {
// Rewrite the code to refer to itself.
- for (auto *BB : Blocks) {
- for (auto &Inst : *BB) {
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &Inst : *BB) {
RemapDbgRecordRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
RemapInstruction(&Inst, VMap,
@@ -1151,9 +1146,9 @@ void llvm::cloneNoAliasScopes(ArrayRef<MDNode *> NoAliasDeclScopes,
StringRef Ext, LLVMContext &Context) {
MDBuilder MDB(Context);
- for (auto *ScopeList : NoAliasDeclScopes) {
- for (const auto &MDOperand : ScopeList->operands()) {
- if (MDNode *MD = dyn_cast<MDNode>(MDOperand)) {
+ for (MDNode *ScopeList : NoAliasDeclScopes) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
+ if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
AliasScopeNode SNANode(MD);
std::string Name;
@@ -1177,7 +1172,7 @@ void llvm::adaptNoAliasScopes(Instruction *I,
auto CloneScopeList = [&](const MDNode *ScopeList) -> MDNode * {
bool NeedsReplacement = false;
SmallVector<Metadata *, 8> NewScopeList;
- for (const auto &MDOp : ScopeList->operands()) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
if (auto *NewMD = ClonedScopes.lookup(MD)) {
NewScopeList.push_back(NewMD);
@@ -1193,12 +1188,12 @@ void llvm::adaptNoAliasScopes(Instruction *I,
};
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
- if (auto *NewScopeList = CloneScopeList(Decl->getScopeList()))
+ if (MDNode *NewScopeList = CloneScopeList(Decl->getScopeList()))
Decl->setScopeList(NewScopeList);
auto replaceWhenNeeded = [&](unsigned MD_ID) {
if (const MDNode *CSNoAlias = I->getMetadata(MD_ID))
- if (auto *NewScopeList = CloneScopeList(CSNoAlias))
+ if (MDNode *NewScopeList = CloneScopeList(CSNoAlias))
I->setMetadata(MD_ID, NewScopeList);
};
replaceWhenNeeded(LLVMContext::MD_noalias);
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index b18acea..4fe736a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1106,7 +1106,6 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
Phi.replaceAllUsesWith(RdxResult);
- continue;
}
}
diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index d7bf791..fb39fdd 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -11,11 +11,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -112,7 +112,7 @@ struct BBValueInfo {
void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs) {
DenseMap<BasicBlock *, BBValueInfo> BBInfos;
- for (auto &R : Rewrites) {
+ for (RewriteInfo &R : Rewrites) {
BBInfos.clear();
// Compute locations for new phi-nodes.
@@ -145,7 +145,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
BBInfos[BB].LiveOutValue = V;
// We've computed IDF, now insert new phi-nodes there.
- for (auto *FrontierBB : IDFBlocks) {
+ for (BasicBlock *FrontierBB : IDFBlocks) {
IRBuilder<> B(FrontierBB, FrontierBB->begin());
PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
BBInfos[FrontierBB].LiveInValue = PN;
@@ -156,7 +156,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
// IsLiveOut indicates whether we are computing live-out values (true) or
// live-in values (false).
auto ComputeValue = [&](BasicBlock *BB, bool IsLiveOut) -> Value * {
- auto *BBInfo = &BBInfos[BB];
+ BBValueInfo *BBInfo = &BBInfos[BB];
if (IsLiveOut && BBInfo->LiveOutValue)
return BBInfo->LiveOutValue;
@@ -187,7 +187,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
if (!V)
V = UndefValue::get(R.Ty);
- for (auto *BBInfo : Stack)
+ for (BBValueInfo *BBInfo : Stack)
// Loop above can insert new entries into the BBInfos map: assume the
// map shouldn't grow due to [1] and BBInfo references are valid.
BBInfo->LiveInValue = V;
@@ -196,7 +196,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
};
// Fill in arguments of the inserted PHIs.
- for (auto *BB : IDFBlocks) {
+ for (BasicBlock *BB : IDFBlocks) {
auto *PHI = cast<PHINode>(&BB->front());
for (BasicBlock *Pred : PredCache.get(BB))
PHI->addIncoming(ComputeValue(Pred, /*IsLiveOut=*/true), Pred);
@@ -222,3 +222,96 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
}
}
}
+
+// Perform a single pass of simplification over the worklist of PHIs.
+// This should be called after RewriteAllUses() because simplifying PHIs
+// immediately after creation would require updating all references to those
+// PHIs in the BBValueInfo structures, which would necessitate additional
+// reference tracking overhead.
+static void simplifyPass(MutableArrayRef<PHINode *> Worklist,
+ const DataLayout &DL) {
+ for (PHINode *&PHI : Worklist) {
+ if (Value *Simplified = simplifyInstruction(PHI, DL)) {
+ PHI->replaceAllUsesWith(Simplified);
+ PHI->eraseFromParent();
+ PHI = nullptr; // Mark as removed.
+ }
+ }
+}
+
+#ifndef NDEBUG // Should this be under EXPENSIVE_CHECKS?
+// New PHI nodes should not reference one another but they may reference
+// themselves or existing PHI nodes, and existing PHI nodes may reference new
+// PHI nodes.
+static bool
+PHIAreRefEachOther(const iterator_range<BasicBlock::phi_iterator> NewPHIs) {
+ SmallPtrSet<PHINode *, 8> NewPHISet;
+ for (PHINode &PN : NewPHIs)
+ NewPHISet.insert(&PN);
+ for (PHINode &PHI : NewPHIs) {
+ for (Value *V : PHI.incoming_values()) {
+ PHINode *IncPHI = dyn_cast<PHINode>(V);
+ if (IncPHI && IncPHI != &PHI && NewPHISet.contains(IncPHI))
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
+static bool replaceIfIdentical(PHINode &PHI, PHINode &ReplPHI) {
+ if (!PHI.isIdenticalToWhenDefined(&ReplPHI))
+ return false;
+ PHI.replaceAllUsesWith(&ReplPHI);
+ PHI.eraseFromParent();
+ return true;
+}
+
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN) {
+ assert(!PHIAreRefEachOther(make_range(BB->phis().begin(), FirstExistingPN)));
+
+ // Deduplicate new PHIs first to reduce the number of comparisons on the
+ // following new -> existing pass.
+ bool Changed = false;
+ for (auto I = BB->phis().begin(); I != FirstExistingPN; ++I) {
+ for (auto J = std::next(I); J != FirstExistingPN;) {
+ Changed |= replaceIfIdentical(*J++, *I);
+ }
+ }
+
+ // Iterate over existing PHIs and replace identical new PHIs.
+ for (PHINode &ExistingPHI : make_range(FirstExistingPN, BB->phis().end())) {
+ auto I = BB->phis().begin();
+ assert(I != FirstExistingPN); // Should be at least one new PHI.
+ do {
+ Changed |= replaceIfIdentical(*I++, ExistingPHI);
+ } while (I != FirstExistingPN);
+ if (BB->phis().begin() == FirstExistingPN)
+ return Changed;
+ }
+ return Changed;
+}
+
+static void deduplicatePass(ArrayRef<PHINode *> Worklist) {
+ SmallDenseMap<BasicBlock *, unsigned> BBs;
+ for (PHINode *PHI : Worklist) {
+ if (PHI)
+ ++BBs[PHI->getParent()];
+ }
+
+ for (auto [BB, NumNewPHIs] : BBs) {
+ auto FirstExistingPN = std::next(BB->phis().begin(), NumNewPHIs);
+ EliminateNewDuplicatePHINodes(BB, FirstExistingPN);
+ }
+}
+
+void SSAUpdaterBulk::RewriteAndOptimizeAllUses(DominatorTree &DT) {
+ SmallVector<PHINode *, 4> PHIs;
+ RewriteAllUses(&DT, &PHIs);
+ if (PHIs.empty())
+ return;
+
+ simplifyPass(PHIs, PHIs.front()->getParent()->getDataLayout());
+ deduplicatePass(PHIs);
+}
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 45cee1e..9035e58 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -526,7 +526,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// Recognize the canonical representation of an unsimplifed urem.
const SCEV *URemLHS = nullptr;
const SCEV *URemRHS = nullptr;
- if (SE.matchURem(S, URemLHS, URemRHS)) {
+ if (match(S, m_scev_URem(m_SCEV(URemLHS), m_SCEV(URemRHS), SE))) {
Value *LHS = expand(URemLHS);
Value *RHS = expand(URemRHS);
return InsertBinop(Instruction::URem, LHS, RHS, SCEV::FlagAnyWrap,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b96d29e..280eb20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7957,9 +7957,9 @@ bool VPRecipeBuilder::getScaledReductions(
auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
&ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
for (const auto &[I, OpI] : enumerate(Ops)) {
- auto *CI = dyn_cast<ConstantInt>(OpI);
- if (I > 0 && CI &&
- canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) {
+ const APInt *C;
+ if (I > 0 && match(OpI, m_APInt(C)) &&
+ canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) {
ExtOpTypes[I] = ExtOpTypes[0];
ExtKinds[I] = ExtKinds[0];
continue;
@@ -8240,14 +8240,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// the vector loop or when not folding the tail. In the later case, we know
// that the canonical induction increment will not overflow as the vector trip
// count is >= increment and a multiple of the increment.
+ VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
if (!HasNUW) {
- auto *IVInc = Plan->getVectorLoopRegion()
- ->getExitingBasicBlock()
- ->getTerminator()
- ->getOperand(0);
- assert(match(IVInc, m_VPInstruction<Instruction::Add>(
- m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
+ auto *IVInc =
+ LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
+ assert(match(IVInc,
+ m_VPInstruction<Instruction::Add>(
+ m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
"Did not find the canonical IV increment");
cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
}
@@ -8293,7 +8293,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
- VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
@@ -8377,8 +8376,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
for (VPValue *Old : Old2New.keys())
Old->getDefiningRecipe()->eraseFromParent();
- assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
- !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
+ assert(isa<VPRegionBlock>(LoopRegion) &&
+ !LoopRegion->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
"VPBasicBlock");
@@ -9326,8 +9325,9 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
if (ResumePhiIter == MainScalarPH->phis().end()) {
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
ResumePhi = ScalarPHBuilder.createScalarPhi(
- {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
- "vec.epilog.resume.val");
+ {VectorTC,
+ MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()},
+ {}, "vec.epilog.resume.val");
} else {
ResumePhi = cast<VPPhi>(&*ResumePhiIter);
if (MainScalarPH->begin() == MainScalarPH->end())
@@ -9354,7 +9354,7 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- VPCanonicalIVPHIRecipe *IV = Plan.getCanonicalIV();
+ VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
// When vectorizing the epilogue loop, the canonical induction needs to be
// adjusted by the value after the main vector loop. Find the resume value
// created during execution of the main VPlan. It must be the first phi in the
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6f4bec..b62c8f1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10659,7 +10659,8 @@ class InstructionsCompatibilityAnalysis {
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
- Opcode == Instruction::UDiv;
+ Opcode == Instruction::UDiv || Opcode == Instruction::And ||
+ Opcode == Instruction::Or || Opcode == Instruction::Xor;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10984,6 +10985,9 @@ public:
case Instruction::Shl:
case Instruction::SDiv:
case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -17628,7 +17632,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
}
if (IsPHI ||
(!E->isGather() && E->State != TreeEntry::SplitVectorize &&
- E->doesNotNeedToSchedule()) ||
+ (E->doesNotNeedToSchedule() ||
+ (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
+ isUsedOutsideBlock(LastInst)))) ||
(GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
E->getOpcode() == Instruction::Load)) {
@@ -19456,7 +19462,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
assert(getNumElements(Cond->getType()) == TrueNumElements &&
"Cannot vectorize Instruction::Select");
- Value *V = Builder.CreateSelect(Cond, True, False);
+ Value *V =
+ Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
@@ -23576,18 +23583,19 @@ class HorizontalReduction {
switch (Kind) {
case RecurKind::Or: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
- RHS, Name);
+ RHS, DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
}
case RecurKind::And: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, RHS,
- ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
+ DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
@@ -23608,7 +23616,8 @@ class HorizontalReduction {
if (UseSelect) {
CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
+ Name);
}
[[fallthrough]];
case RecurKind::FMax:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1fea068..d167009 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -635,9 +635,9 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
const VPRecipeBase *R = &VPBB->back();
bool IsSwitch = isa<VPInstruction>(R) &&
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
- bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
- match(R, m_BranchOnCond(m_VPValue())) ||
- match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
+ bool IsCondBranch =
+ isa<VPBranchOnMaskRecipe>(R) ||
+ match(R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()));
(void)IsCondBranch;
(void)IsSwitch;
if (VPBB->getNumSuccessors() == 2 ||
@@ -1753,14 +1753,14 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
}
#endif
-bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool llvm::canConstantBeExtended(const APInt *C, Type *NarrowType,
TTI::PartialReductionExtendKind ExtKind) {
- APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits());
- unsigned WideSize = CI->getType()->getScalarSizeInBits();
+ APInt TruncatedVal = C->trunc(NarrowType->getScalarSizeInBits());
+ unsigned WideSize = C->getBitWidth();
APInt ExtendedVal = ExtKind == TTI::PR_SignExtend
? TruncatedVal.sext(WideSize)
: TruncatedVal.zext(WideSize);
- return ExtendedVal == CI->getValue();
+ return ExtendedVal == *C;
}
TargetTransformInfo::OperandValueInfo
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fb696be..0e0b042 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -24,12 +24,9 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
@@ -41,10 +38,11 @@
#include "llvm/IR/Operator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/InstructionCost.h"
-#include <algorithm>
#include <cassert>
#include <cstddef>
+#include <functional>
#include <string>
+#include <utility>
namespace llvm {
@@ -346,13 +344,6 @@ public:
/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
- /// Return true if it is legal to hoist instructions into this block.
- bool isLegalToHoistInto() {
- // There are currently no constraints that prevent an instruction to be
- // hoisted into a VPBlockBase.
- return true;
- }
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
OS << getName();
@@ -1021,6 +1012,8 @@ public:
// part if scalar. In the latter case, the recipe will be removed during
// unrolling.
ExtractLastElement,
+ // Extracts the last lane for each part from its operand.
+ ExtractLastLanePerPart,
// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
@@ -1064,6 +1057,7 @@ public:
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ OpsEnd = VScale,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -4066,6 +4060,19 @@ public:
/// Remove the current region from its VPlan, connecting its predecessor to
/// its entry, and its exiting block to its successor.
void dissolveToCFGLoop();
+
+ /// Returns the canonical induction recipe of the region.
+ VPCanonicalIVPHIRecipe *getCanonicalIV() {
+ VPBasicBlock *EntryVPBB = getEntryBasicBlock();
+ if (EntryVPBB->empty()) {
+ // VPlan native path. TODO: Unify both code paths.
+ EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
+ }
+ return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+ }
+ const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
+ return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
+ }
};
/// VPlan models a candidate for vectorization, encoding various decisions take
@@ -4260,12 +4267,14 @@ public:
BackedgeTakenCount = new VPValue();
return BackedgeTakenCount;
}
+ VPValue *getBackedgeTakenCount() const { return BackedgeTakenCount; }
/// The vector trip count.
VPValue &getVectorTripCount() { return VectorTripCount; }
/// Returns the VF of the vector loop region.
VPValue &getVF() { return VF; };
+ const VPValue &getVF() const { return VF; };
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
@@ -4377,16 +4386,6 @@ public:
LLVM_DUMP_METHOD void dump() const;
#endif
- /// Returns the canonical induction recipe of the vector loop.
- VPCanonicalIVPHIRecipe *getCanonicalIV() {
- VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
- if (EntryVPBB->empty()) {
- // VPlan native path.
- EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
- }
- return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
- }
-
VPValue *getSCEVExpansion(const SCEV *S) const {
return SCEVToExpansion.lookup(S);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 07bfe7a..f413c63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -116,6 +116,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 81deba2..332791a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -433,8 +433,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// We are about to replace the branch to exit the region. Remove the original
// BranchOnCond, if there is any.
DebugLoc LatchDL = DL;
- if (!LatchVPBB->empty() &&
- match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) {
+ if (!LatchVPBB->empty() && match(&LatchVPBB->back(), m_BranchOnCond())) {
LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
LatchVPBB->getTerminator()->eraseFromParent();
}
@@ -480,8 +479,7 @@ static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
PredicatedScalarEvolution &PSE, Loop *TheLoop) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor());
canonicalHeaderAndLatch(HeaderVPBB, VPDT);
@@ -623,8 +621,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
}
void VPlanTransforms::createLoopRegions(VPlan &Plan) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (VPBlockBase *HeaderVPB : vp_post_order_shallow(Plan.getEntry()))
if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
createLoopRegion(Plan, HeaderVPB);
@@ -661,9 +658,11 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
}
VPIRMetadata VPBranchWeights;
- auto *Term = VPBuilder(CheckBlockVPBB)
- .createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
- Plan.getCanonicalIV()->getDebugLoc());
+ auto *Term =
+ VPBuilder(CheckBlockVPBB)
+ .createNaryOp(
+ VPInstruction::BranchOnCond, {CondVPV},
+ Plan.getVectorLoopRegion()->getCanonicalIV()->getDebugLoc());
if (AddBranchWeights) {
MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
@@ -875,8 +874,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
auto *VPBB = cast<VPBasicBlock>(VPB);
for (auto &R : *VPBB) {
- if (R.mayWriteToMemory() &&
- !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (R.mayWriteToMemory() && !match(&R, m_BranchOnCount()))
return false;
}
}
@@ -925,8 +923,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
if (DerivedIV->getNumUsers() == 1 &&
DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
- auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
- &Plan.getVectorTripCount());
+ auto *NewSel = Builder.createSelect(
+ AnyNaN, LoopRegion->getCanonicalIV(), &Plan.getVectorTripCount());
DerivedIV->moveAfter(&*Builder.getInsertPoint());
DerivedIV->setOperand(1, NewSel);
continue;
@@ -939,7 +937,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
"FMaxNum/FMinNum reduction.\n");
return false;
}
- auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
+ auto *NewSel =
+ Builder.createSelect(AnyNaN, LoopRegion->getCanonicalIV(), VecV);
ResumeR->setOperand(0, NewSel);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 577432f..44506f5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -39,7 +39,6 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
using Base = DominatorTreeBase<VPBlockBase, false>;
public:
- VPDominatorTree() = default;
explicit VPDominatorTree(VPlan &Plan) { recalculate(Plan); }
/// Returns true if \p A properly dominates \p B.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 1580a3b..2aaabd9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -474,7 +474,7 @@ public:
/// Check if a constant \p CI can be safely treated as having been extended
/// from a narrower type with the given extension kind.
-bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType,
+bool canConstantBeExtended(const APInt *C, Type *NarrowType,
TTI::PartialReductionExtendKind ExtKind);
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 555efea..d8203e2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -173,10 +173,10 @@ inline int_pred_ty<is_zero_int> m_ZeroInt() {
/// For vectors, this includes constants with undefined elements.
inline int_pred_ty<is_one> m_One() { return int_pred_ty<is_one>(); }
-struct bind_const_int {
- uint64_t &Res;
+struct bind_apint {
+ const APInt *&Res;
- bind_const_int(uint64_t &Res) : Res(Res) {}
+ bind_apint(const APInt *&Res) : Res(Res) {}
bool match(VPValue *VPV) const {
if (!VPV->isLiveIn())
@@ -188,7 +188,23 @@ struct bind_const_int {
const auto *CI = dyn_cast<ConstantInt>(V);
if (!CI)
return false;
- if (auto C = CI->getValue().tryZExtValue()) {
+ Res = &CI->getValue();
+ return true;
+ }
+};
+
+inline bind_apint m_APInt(const APInt *&C) { return C; }
+
+struct bind_const_int {
+ uint64_t &Res;
+
+ bind_const_int(uint64_t &Res) : Res(Res) {}
+
+ bool match(VPValue *VPV) const {
+ const APInt *APConst;
+ if (!bind_apint(APConst).match(VPV))
+ return false;
+ if (auto C = APConst->tryZExtValue()) {
Res = *C;
return true;
}
@@ -344,6 +360,10 @@ m_Freeze(const Op0_t &Op0) {
return m_VPInstruction<Instruction::Freeze>(Op0);
}
+inline VPInstruction_match<VPInstruction::BranchOnCond> m_BranchOnCond() {
+ return m_VPInstruction<VPInstruction::BranchOnCond>();
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t>
m_BranchOnCond(const Op0_t &Op0) {
@@ -368,12 +388,22 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}
+template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
+m_ExtractLastLanePerPart(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
+}
+
template <typename Op0_t, typename Op1_t, typename Op2_t>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
}
+inline VPInstruction_match<VPInstruction::BranchOnCount> m_BranchOnCount() {
+ return m_VPInstruction<VPInstruction::BranchOnCount>();
+}
+
template <typename Op0_t, typename Op1_t>
inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t>
m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
@@ -386,6 +416,12 @@ m_AnyOf(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::AnyOf>(Op0);
}
+template <typename Op0_t>
+inline VPInstruction_match<VPInstruction::FirstActiveLane, Op0_t>
+m_FirstActiveLane(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::FirstActiveLane>(Op0);
+}
+
template <unsigned Opcode, typename Op0_t>
inline AllRecipe_match<Opcode, Op0_t> m_Unary(const Op0_t &Op0) {
return AllRecipe_match<Opcode, Op0_t>(Op0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 0c27d53..fb17d5d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -168,7 +168,8 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
// non-phi instructions.
auto &Plan = *HeaderVPBB->getPlan();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ auto *IV =
+ new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV());
Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
Builder.insert(IV);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8e916772..7a98c75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -341,12 +341,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
ExtAType = GetExtendKind(ExtAR);
ExtBType = GetExtendKind(ExtBR);
- if (!ExtBR && Widen->getOperand(1)->isLiveIn()) {
- auto *CI = cast<ConstantInt>(Widen->getOperand(1)->getLiveInIRValue());
- if (canConstantBeExtended(CI, InputTypeA, ExtAType)) {
- InputTypeB = InputTypeA;
- ExtBType = ExtAType;
- }
+ using namespace VPlanPatternMatch;
+ const APInt *C;
+ if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
+ canConstantBeExtended(C, InputTypeA, ExtAType)) {
+ InputTypeB = InputTypeA;
+ ExtBType = ExtAType;
}
};
@@ -511,6 +511,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
@@ -878,9 +879,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
+ case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement: {
- unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
+ unsigned Offset =
+ getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
@@ -1154,7 +1157,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
case VPInstruction::ExtractPenultimateElement:
if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
// TODO: Compute cost other VPInstructions once the legacy cost model has
// been retired.
@@ -1166,6 +1169,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
+ getOpcode() == VPInstruction::ExtractLastLanePerPart ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
@@ -1229,6 +1233,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
@@ -1376,6 +1381,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
+ case VPInstruction::ExtractLastLanePerPart:
+ O << "extract-last-lane-per-part";
+ break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
@@ -2344,7 +2352,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return false;
auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
- auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
+ auto *CanIV = getParent()->getParent()->getCanonicalIV();
return StartC && StartC->isZero() && StepC && StepC->isOne() &&
getScalarType() == CanIV->getScalarType();
}
@@ -2855,7 +2863,7 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
case ExpressionTypes::ExtNegatedMulAccReduction:
assert(Opcode == Instruction::Add && "Unexpected opcode");
Opcode = Instruction::Sub;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case ExpressionTypes::ExtMulAccReduction: {
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9bb8820..cae9aee8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -501,7 +501,8 @@ static void removeRedundantInductionCasts(VPlan &Plan) {
/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
for (VPUser *U : CanonicalIV->users()) {
WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
@@ -512,7 +513,7 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
if (!WidenNewIV)
return;
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
@@ -582,8 +583,9 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, Instruction *TruncI,
VPValue *StartV, VPValue *Step, DebugLoc DL,
VPBuilder &Builder) {
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
@@ -786,9 +788,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
ScalarEvolution &SE) {
VPValue *Incoming, *Mask;
if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
- m_VPInstruction<VPInstruction::FirstActiveLane>(
- m_VPValue(Mask)),
- m_VPValue(Incoming))))
+ m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming))))
return nullptr;
auto *WideIV = getOptimizableIVOf(Incoming, SE);
@@ -800,8 +800,9 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
return nullptr;
// Calculate the final index.
- VPValue *EndValue = Plan.getCanonicalIV();
- auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *CanonicalIV = LoopRegion->getCanonicalIV();
+ Type *CanonicalIVType = CanonicalIV->getScalarType();
VPBuilder B(cast<VPBasicBlock>(PredVPBB));
DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -810,7 +811,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
FirstActiveLaneType, DL);
- EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);
+ VPValue *EndValue =
+ B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
// `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
// changed it means the exit is using the incremented value, so we need to
@@ -1205,7 +1207,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
}
// Look through ExtractLastElement (BuildVector ....).
- if (match(&R, m_ExtractLastElement(m_BuildVector()))) {
+ if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
+ m_ExtractLastLanePerPart(m_BuildVector())))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
@@ -1271,13 +1274,15 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(Def, m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) {
+ if (match(Def,
+ m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
+ m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
Def->replaceAllUsesWith(A);
return;
}
- if (match(Def,
- m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
+ if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
+ m_ExtractLastLanePerPart(m_VPValue(A)))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
@@ -1285,6 +1290,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
[Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
return Def->replaceAllUsesWith(A);
}
+
+ if (Plan->getUF() == 1 &&
+ match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
+ return Def->replaceAllUsesWith(
+ Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
+ }
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
@@ -1322,8 +1333,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
- auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
- {Clone->getOperand(0)});
+ unsigned ExtractOpc =
+ vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
+ ? VPInstruction::ExtractLastElement
+ : VPInstruction::ExtractLastLanePerPart;
+ auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
Ext->insertBefore(Clone);
Clone->setOperand(0, Ext);
RepR->eraseFromParent();
@@ -1337,7 +1351,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
match(cast<VPRecipeBase>(U),
- m_ExtractLastElement(m_VPValue()));
+ m_CombineOr(m_ExtractLastElement(m_VPValue()),
+ m_ExtractLastLanePerPart(m_VPValue())));
}))
continue;
@@ -1530,7 +1545,7 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
});
- auto *CanIV = Plan.getCanonicalIV();
+ auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
m_Specific(CanIV->getBackedgeValue()),
m_Specific(&Plan.getVectorTripCount()))))
@@ -1658,7 +1673,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *Term = &ExitingVPBB->back();
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
- if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+ if (match(Term, m_BranchOnCount()) ||
match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
m_VPValue(), m_VPValue(), m_VPValue()))))) {
// Try to simplify the branch condition if TC <= VF * UF when the latch
@@ -1909,8 +1924,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
VPBuilder &LoopBuilder) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
for (VPRecipeBase &R :
@@ -1992,6 +2006,13 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
return std::make_pair(true, I->getVectorIntrinsicID());
})
+ .Case<VPVectorPointerRecipe>([](auto *I) {
+ // For recipes that do not directly map to LLVM IR instructions,
+ // assign opcodes after the last VPInstruction opcode (which is also
+ // after the last IR Instruction opcode), based on the VPDefID.
+ return std::make_pair(false,
+ VPInstruction::OpsEnd + 1 + I->getVPDefID());
+ })
.Default([](auto *) { return std::nullopt; });
}
@@ -2015,11 +2036,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
static bool canHandle(const VPSingleDefRecipe *Def) {
// We can extend the list of handled recipes in the future,
// provided we account for the data embedded in them while checking for
- // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
- // as it is essentially a GEP with different semantics.
- auto C = isa<VPVectorPointerRecipe>(Def)
- ? std::make_pair(false, Instruction::GetElementPtr)
- : getOpcodeOrIntrinsicID(Def);
+ // equality or hashing.
+ auto C = getOpcodeOrIntrinsicID(Def);
// The issue with (Insert|Extract)Value is that the index of the
// insert/extract is not a proper operand in LLVM IR, and hence also not in
@@ -2058,6 +2076,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
!equal(L->operands(), R->operands()))
return false;
+ assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
+ "must have valid opcode info for both recipes");
if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
if (LFlags->hasPredicate() &&
LFlags->getPredicate() !=
@@ -2102,9 +2122,18 @@ static void licm(VPlan &Plan) {
VPBasicBlock *Preheader = Plan.getVectorPreheader();
// Return true if we do not know how to (mechanically) hoist a given recipe
- // out of a loop region. Does not address legality concerns such as aliasing
- // or speculation safety.
+ // out of a loop region.
auto CannotHoistRecipe = [](VPRecipeBase &R) {
+ // Assumes don't alias anything or throw; as long as they're guaranteed to
+ // execute, they're safe to hoist.
+ if (match(&R, m_Intrinsic<Intrinsic::assume>()))
+ return false;
+
+ // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+ // their memory location is not modified in the vector loop.
+ if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+ return true;
+
// Allocas cannot be hoisted.
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
return RepR && RepR->getOpcode() == Instruction::Alloca;
@@ -2112,17 +2141,18 @@ static void licm(VPlan &Plan) {
// Hoist any loop invariant recipes from the vector loop region to the
// preheader. Preform a shallow traversal of the vector loop region, to
- // exclude recipes in replicate regions.
+ // exclude recipes in replicate regions. Since the top-level blocks in the
+ // vector loop region are guaranteed to execute if the vector pre-header is,
+ // we don't need to check speculation safety.
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ assert(Preheader->getSingleSuccessor() == LoopRegion &&
+ "Expected vector preheader's successor to be the vector loop region");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(LoopRegion->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (CannotHoistRecipe(R))
continue;
- // TODO: Relax checks in the future, e.g. we could also hoist reads, if
- // their memory location is not modified in the vector loop.
- if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
- any_of(R.operands(), [](VPValue *Op) {
+ if (any_of(R.operands(), [](VPValue *Op) {
return !Op->isDefinedOutsideLoopRegions();
}))
continue;
@@ -2314,7 +2344,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();
auto *CanonicalIVIncrement =
@@ -2353,7 +2383,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Create the active lane mask instruction in the VPlan preheader.
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");
@@ -2389,13 +2419,15 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
/// TODO: Introduce explicit recipe for header-mask instead of searching
/// for the header-mask pattern manually.
static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
SmallVector<VPValue *> WideCanonicalIVs;
- auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
- IsaPred<VPWidenCanonicalIVRecipe>);
- assert(count_if(Plan.getCanonicalIV()->users(),
+ auto *FoundWidenCanonicalIVUser = find_if(
+ LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
+ assert(count_if(LoopRegion->getCanonicalIV()->users(),
IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
"Must have at most one VPWideCanonicalIVRecipe");
- if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
+ if (FoundWidenCanonicalIVUser !=
+ LoopRegion->getCanonicalIV()->users().end()) {
auto *WideCanonicalIV =
cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
WideCanonicalIVs.push_back(WideCanonicalIV);
@@ -2403,7 +2435,7 @@ static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
// Also include VPWidenIntOrFpInductionRecipes that represent a widened
// version of the canonical induction.
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (WidenOriginalIV && WidenOriginalIV->isCanonical())
@@ -2436,8 +2468,9 @@ void VPlanTransforms::addActiveLaneMask(
"DataAndControlFlowWithoutRuntimeCheck implies "
"UseActiveLaneMaskForControlFlow");
- auto *FoundWidenCanonicalIVUser = find_if(Plan.getCanonicalIV()->users(),
- IsaPred<VPWidenCanonicalIVRecipe>);
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *FoundWidenCanonicalIVUser = find_if(
+ LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
assert(FoundWidenCanonicalIVUser &&
"Must have widened canonical IV when tail folding!");
VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
@@ -2450,7 +2483,7 @@ void VPlanTransforms::addActiveLaneMask(
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
- ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+ ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
LaneMask =
B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
@@ -2560,9 +2593,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
});
assert(all_of(Plan.getVFxUF().users(),
- [&Plan](VPUser *U) {
- return match(U, m_c_Add(m_Specific(Plan.getCanonicalIV()),
- m_Specific(&Plan.getVFxUF()))) ||
+ [&LoopRegion, &Plan](VPUser *U) {
+ return match(U,
+ m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
+ m_Specific(&Plan.getVFxUF()))) ||
isa<VPWidenPointerInductionRecipe>(U);
}) &&
"Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
@@ -2717,9 +2751,10 @@ void VPlanTransforms::addExplicitVectorLength(
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
if (Plan.hasScalarVFOnly())
return;
- VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
auto *CanIVTy = CanonicalIVPHI->getScalarType();
VPValue *StartV = CanonicalIVPHI->getStartValue();
@@ -3021,8 +3056,7 @@ void VPlanTransforms::createInterleaveGroups(
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (const auto *IG : InterleaveGroups) {
auto *Start =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
@@ -3398,9 +3432,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPBuilder Builder(LatchVPBB->getTerminator());
VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
- assert(
- match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
- "Terminator must be be BranchOnCond");
+ assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
+ "Terminator must be BranchOnCond");
VPValue *CondOfEarlyExitingVPBB =
EarlyExitingVPBB->getTerminator()->getOperand(0);
auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
@@ -3662,8 +3695,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
return;
#ifndef NDEBUG
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
#endif
SmallVector<VPValue *> VPValues;
@@ -4009,8 +4041,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R) ||
- match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
continue;
if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
@@ -4163,7 +4194,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
// Adjust induction to reflect that the transformed plan only processes one
// original iteration.
- auto *CanIV = Plan.getCanonicalIV();
+ auto *CanIV = VectorLoop->getCanonicalIV();
auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
VPBuilder PHBuilder(Plan.getVectorPreheader());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5e7f19f..5aeda3e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -69,7 +69,8 @@ class UnrollState {
VPBasicBlock::iterator InsertPtForPhi);
VPValue *getConstantVPV(unsigned Part) {
- Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+ Type *CanIVIntTy =
+ Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType();
return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
}
@@ -259,8 +260,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
- if (match(&R, m_BranchOnCond(m_VPValue())) ||
- match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
return;
if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
@@ -352,8 +352,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
// Compute*Result which combine all parts to compute the final value.
VPValue *Op1;
if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
- match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
- m_VPValue(Op1))) ||
+ match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 66748c5..8b1b0e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -53,7 +53,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
return Expanded;
}
-bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
+bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
if (isa<VPActiveLaneMaskPHIRecipe>(V))
return true;
@@ -67,12 +67,14 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
return B == Plan.getTripCount() &&
- (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_One(),
- m_Specific(&Plan.getVF()))) ||
+ (match(A,
+ m_ScalarIVSteps(
+ m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
+ m_One(), m_Specific(&Plan.getVF()))) ||
IsWideCanonicalIV(A));
return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
- B == Plan.getOrCreateBackedgeTakenCount();
+ B == Plan.getBackedgeTakenCount();
}
const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
@@ -102,7 +104,8 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return all_of(R->operands(), isUniformAcrossVFsAndUFs);
}
- auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
+ auto *CanonicalIV =
+ R->getParent()->getEnclosingLoopRegion()->getCanonicalIV();
// Canonical IV chain is uniform.
if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 0222b0a..cf95ac0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -90,7 +90,7 @@ inline bool isSingleScalar(const VPValue *VPV) {
}
/// Return true if \p V is a header mask in \p Plan.
-bool isHeaderMask(const VPValue *V, VPlan &Plan);
+bool isHeaderMask(const VPValue *V, const VPlan &Plan);
/// Checks if \p V is uniform across all VF lanes and UF parts. It is considered
/// as such if it is either loop invariant (defined outside the vector region)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 013ea2e..91734a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -24,6 +24,7 @@
#define DEBUG_TYPE "loop-vectorize"
using namespace llvm;
+using namespace VPlanPatternMatch;
namespace {
class VPlanVerifier {
@@ -198,7 +199,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
// EVLIVIncrement is only used by EVLIV & BranchOnCount.
// Having more than two users is unexpected.
- using namespace llvm::VPlanPatternMatch;
if (I->getOpcode() != VPInstruction::Broadcast &&
I->getNumUsers() != 1 &&
(I->getNumUsers() != 2 ||
@@ -298,11 +298,16 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
return false;
}
}
- if (const auto *EVL = dyn_cast<VPInstruction>(&R)) {
- if (EVL->getOpcode() == VPInstruction::ExplicitVectorLength &&
- !verifyEVLRecipe(*EVL)) {
- errs() << "EVL VPValue is not used correctly\n";
- return false;
+ if (const auto *VPI = dyn_cast<VPInstruction>(&R)) {
+ switch (VPI->getOpcode()) {
+ case VPInstruction::ExplicitVectorLength:
+ if (!verifyEVLRecipe(*VPI)) {
+ errs() << "EVL VPValue is not used correctly\n";
+ return false;
+ }
+ break;
+ default:
+ break;
}
}
}
@@ -479,8 +484,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
}
auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
- if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
- LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
+ if (!match(LastInst, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) {
errs() << "VPlan vector loop exit must end with BranchOnCount or "
"BranchOnCond VPInstruction\n";
return false;
@@ -490,8 +494,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
}
bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate) {
- VPDominatorTree VPDT;
- VPDT.recalculate(const_cast<VPlan &>(Plan));
+ VPDominatorTree VPDT(const_cast<VPlan &>(Plan));
VPTypeAnalysis TypeInfo(Plan);
VPlanVerifier Verifier(VPDT, TypeInfo, VerifyLate);
return Verifier.verify(Plan);
diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp
index f4ba0eb..d0c6853 100644
--- a/llvm/lib/XRay/BlockIndexer.cpp
+++ b/llvm/lib/XRay/BlockIndexer.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockIndexer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockIndexer::visit(BufferExtents &) { return Error::success(); }
@@ -89,6 +89,3 @@ Error BlockIndexer::flush() {
CurrentBlock.WallclockTime = nullptr;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockPrinter.cpp b/llvm/lib/XRay/BlockPrinter.cpp
index 63a60c3..d85be5b 100644
--- a/llvm/lib/XRay/BlockPrinter.cpp
+++ b/llvm/lib/XRay/BlockPrinter.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockPrinter.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockPrinter::visit(BufferExtents &R) {
OS << "\n[New Block]\n";
@@ -108,6 +108,3 @@ Error BlockPrinter::visit(EndBufferRecord &R) {
auto E = RP.visit(R);
return E;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockVerifier.cpp b/llvm/lib/XRay/BlockVerifier.cpp
index 99f255e..e39f6b6 100644
--- a/llvm/lib/XRay/BlockVerifier.cpp
+++ b/llvm/lib/XRay/BlockVerifier.cpp
@@ -10,19 +10,18 @@
#include <bitset>
-namespace llvm {
-namespace xray {
-namespace {
+using namespace llvm;
+using namespace llvm::xray;
-constexpr unsigned long long mask(BlockVerifier::State S) {
+static constexpr unsigned long long mask(BlockVerifier::State S) {
return 1uLL << static_cast<std::size_t>(S);
}
-constexpr std::size_t number(BlockVerifier::State S) {
+static constexpr std::size_t number(BlockVerifier::State S) {
return static_cast<std::size_t>(S);
}
-StringRef recordToString(BlockVerifier::State R) {
+static StringRef recordToString(BlockVerifier::State R) {
switch (R) {
case BlockVerifier::State::BufferExtents:
return "BufferExtents";
@@ -53,6 +52,8 @@ StringRef recordToString(BlockVerifier::State R) {
llvm_unreachable("Unkown state!");
}
+namespace {
+
struct Transition {
BlockVerifier::State From;
std::bitset<number(BlockVerifier::State::StateMax)> ToStates;
@@ -133,7 +134,7 @@ Error BlockVerifier::transition(State To) {
CurrentRecord = To;
return Error::success();
-} // namespace xray
+}
Error BlockVerifier::visit(BufferExtents &) {
return transition(State::BufferExtents);
@@ -201,6 +202,3 @@ Error BlockVerifier::verify() {
}
void BlockVerifier::reset() { CurrentRecord = State::Unknown; }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecordProducer.cpp b/llvm/lib/XRay/FDRRecordProducer.cpp
index 479b710..0f4eed1 100644
--- a/llvm/lib/XRay/FDRRecordProducer.cpp
+++ b/llvm/lib/XRay/FDRRecordProducer.cpp
@@ -10,8 +10,8 @@
#include <cstdint>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -31,8 +31,9 @@ enum MetadataRecordKinds : uint8_t {
// This is an end marker, used to identify the upper bound for this enum.
EnumEndMarker,
};
+} // namespace
-Expected<std::unique_ptr<Record>>
+static Expected<std::unique_ptr<Record>>
metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
@@ -72,12 +73,10 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
llvm_unreachable("Unhandled MetadataRecordKinds enum value");
}
-constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+static constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
return FirstByte & 0x01u;
}
-} // namespace
-
Expected<std::unique_ptr<Record>>
FileBasedRecordProducer::findNextBufferExtent() {
// We seek one byte at a time until we find a suitable buffer extents metadata
@@ -193,6 +192,3 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
assert(R != nullptr);
return std::move(R);
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecords.cpp b/llvm/lib/XRay/FDRRecords.cpp
index ff315d3..a18f733 100644
--- a/llvm/lib/XRay/FDRRecords.cpp
+++ b/llvm/lib/XRay/FDRRecords.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BufferExtents::apply(RecordVisitor &V) { return V.visit(*this); }
Error WallclockRecord::apply(RecordVisitor &V) { return V.visit(*this); }
@@ -61,6 +61,3 @@ StringRef Record::kindToString(RecordKind K) {
}
return "Unknown";
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index b68e997..991e6e5 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRTraceExpander.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
void TraceExpander::resetCurrentRecord() {
if (BuildingRecord)
@@ -126,6 +126,3 @@ Error TraceExpander::flush() {
resetCurrentRecord();
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index fb59125..3e320a6 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -12,8 +12,8 @@
#include "llvm/XRay/FDRTraceWriter.h"
#include <tuple>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -37,9 +37,10 @@ template <size_t Index> struct IndexedWriter {
return 0;
}
};
+} // namespace
template <uint8_t Kind, class... Values>
-Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
+static Error writeMetadata(support::endian::Writer &OS, Values &&...Ds) {
// The first bit in the first byte of metadata records is always set to 1, so
// we ensure this is the case when we write out the first byte of the record.
uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
@@ -54,8 +55,6 @@ Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
return Error::success();
}
-} // namespace
-
FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
: OS(O, llvm::endianness::native) {
// We need to re-construct a header, by writing the fields we care about for
@@ -146,6 +145,3 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
OS.write(R.delta());
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FileHeaderReader.cpp b/llvm/lib/XRay/FileHeaderReader.cpp
index 6b6daf9..681cef7 100644
--- a/llvm/lib/XRay/FileHeaderReader.cpp
+++ b/llvm/lib/XRay/FileHeaderReader.cpp
@@ -7,12 +7,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FileHeaderReader.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
// Populates the FileHeader reference by reading the first 32 bytes of the file.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
- uint64_t &OffsetPtr) {
+Expected<XRayFileHeader>
+xray::readBinaryFormatHeader(DataExtractor &HeaderExtractor,
+ uint64_t &OffsetPtr) {
// FIXME: Maybe deduce whether the data is little or big-endian using some
// magic bytes in the beginning of the file?
@@ -68,6 +69,3 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
OffsetPtr += 16;
return std::move(FileHeader);
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/LogBuilderConsumer.cpp b/llvm/lib/XRay/LogBuilderConsumer.cpp
index ffb49f9..f0fc336 100644
--- a/llvm/lib/XRay/LogBuilderConsumer.cpp
+++ b/llvm/lib/XRay/LogBuilderConsumer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecordConsumer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error LogBuilderConsumer::consume(std::unique_ptr<Record> R) {
if (!R)
@@ -32,6 +32,3 @@ Error PipelineConsumer::consume(std::unique_ptr<Record> R) {
Result = joinErrors(std::move(Result), R->apply(*V));
return Result;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Profile.cpp b/llvm/lib/XRay/Profile.cpp
index 1b340e5..ecb767b 100644
--- a/llvm/lib/XRay/Profile.cpp
+++ b/llvm/lib/XRay/Profile.cpp
@@ -18,8 +18,8 @@
#include "llvm/XRay/Trace.h"
#include <memory>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Profile::Profile(const Profile &O) {
// We need to re-create all the tries from the original (O), into the current
@@ -46,6 +46,7 @@ struct BlockHeader {
uint32_t Number;
uint64_t Thread;
};
+} // namespace
static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
uint64_t &Offset) {
@@ -115,8 +116,6 @@ static Expected<Profile::Data> readData(DataExtractor &Extractor,
return D;
}
-} // namespace
-
Error Profile::addBlock(Block &&B) {
if (B.PathData.empty())
return make_error<StringError>(
@@ -189,7 +188,7 @@ Profile::PathID Profile::internPath(ArrayRef<FuncID> P) {
return Node->ID;
}
-Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByThread(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
using PathDataMapPtr = std::unique_ptr<PathDataMap>;
@@ -228,7 +227,7 @@ Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
return Merged;
}
-Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByStack(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
PathDataMap PathData;
@@ -258,7 +257,7 @@ Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
return Merged;
}
-Expected<Profile> loadProfile(StringRef Filename) {
+Expected<Profile> xray::loadProfile(StringRef Filename) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
if (!FdOrErr)
return FdOrErr.takeError();
@@ -322,7 +321,7 @@ struct StackEntry {
} // namespace
-Expected<Profile> profileFromTrace(const Trace &T) {
+Expected<Profile> xray::profileFromTrace(const Trace &T) {
Profile P;
// The implementation of the algorithm re-creates the execution of
@@ -397,6 +396,3 @@ Expected<Profile> profileFromTrace(const Trace &T) {
return P;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordInitializer.cpp b/llvm/lib/XRay/RecordInitializer.cpp
index 68ab3db..83d5f14 100644
--- a/llvm/lib/XRay/RecordInitializer.cpp
+++ b/llvm/lib/XRay/RecordInitializer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordInitializer::visit(BufferExtents &R) {
if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t)))
@@ -426,6 +426,3 @@ Error RecordInitializer::visit(FunctionRecord &R) {
assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordPrinter.cpp b/llvm/lib/XRay/RecordPrinter.cpp
index 32d4210..b9b7a16 100644
--- a/llvm/lib/XRay/RecordPrinter.cpp
+++ b/llvm/lib/XRay/RecordPrinter.cpp
@@ -9,8 +9,8 @@
#include "llvm/Support/FormatVariadic.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordPrinter::visit(BufferExtents &R) {
OS << formatv("<Buffer: size = {0} bytes>", R.size()) << Delim;
@@ -103,6 +103,3 @@ Error RecordPrinter::visit(FunctionRecord &R) {
OS << Delim;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 74515b1..14a3f01 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -29,11 +29,9 @@ using namespace llvm;
using namespace llvm::xray;
using llvm::yaml::Input;
-namespace {
-
-Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return make_error<StringError>(
"Not enough bytes for an XRay log.",
@@ -265,8 +263,9 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
/// what FunctionRecord instances use, and we no longer need to include the CPU
/// id in the CustomEventRecord.
///
-Error loadFDRLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
+static Error loadFDRLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -348,8 +347,8 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
return Error::success();
}
-Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
YAMLXRayTrace Trace;
Input In(Data);
In >> Trace;
@@ -376,7 +375,6 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
});
return Error::success();
}
-} // namespace
Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
diff --git a/llvm/test/Analysis/BasicAA/intrinsics.ll b/llvm/test/Analysis/BasicAA/intrinsics.ll
index f8b30df..56d762b 100644
--- a/llvm/test/Analysis/BasicAA/intrinsics.ll
+++ b/llvm/test/Analysis/BasicAA/intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -aa-pipeline=basic-aa -passes=gvn -S < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
@@ -5,12 +6,15 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; BasicAA should prove that these calls don't interfere, since they are
; IntrArgReadMem and have noalias pointers.
-; CHECK: define <8 x i16> @test0(ptr noalias %p, ptr noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[ATTR:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> %y, ptr %q, i32 16, <8 x i1> %m)
-; CHECK-NEXT: %c = add <8 x i16> %a, %a
define <8 x i16> @test0(ptr noalias %p, ptr noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
+; CHECK-LABEL: define <8 x i16> @test0(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], <8 x i16> [[Y:%.*]], <8 x i1> [[M:%.*]], <8 x i16> [[PT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P]], i32 16, <8 x i1> [[M]], <8 x i16> [[PT]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr [[Q]], i32 16, <8 x i1> [[M]])
+; CHECK-NEXT: [[C:%.*]] = add <8 x i16> [[A]], [[A]]
+; CHECK-NEXT: ret <8 x i16> [[C]]
+;
entry:
%a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind
call void @llvm.masked.store.v8i16.p0(<8 x i16> %y, ptr %q, i32 16, <8 x i1> %m)
@@ -24,4 +28,3 @@ declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) nounwind
; CHECK: attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
; CHECK: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
-; CHECK: attributes [[ATTR]] = { nounwind }
diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
index 7e980c9..ffd8259 100644
--- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
+++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll
@@ -1,10 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s
define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
-; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[ARR:%.*]] = alloca [64 x i32], align 4
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[GEP_0_48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16
+; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]]
;
%arr = alloca [64 x i32], align 4
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
@@ -34,9 +47,21 @@ define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
define <4 x float> @dead_scalable_store_fixed(ptr %0) {
; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
-; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
-; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.32, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
-; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[ARR:%.*]] = alloca [64 x i32], align 4
+; CHECK-NEXT: [[MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[GEP_0_48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16
+; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[FADDOP0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADDOP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADD:%.*]] = fadd <4 x float> [[FADDOP0]], [[FADDOP1]]
+; CHECK-NEXT: ret <4 x float> [[FADD]]
;
%arr = alloca [64 x i32], align 4
%mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
@@ -67,9 +92,25 @@ define <4 x float> @dead_scalable_store_fixed(ptr %0) {
define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[ARR:%.*]] = alloca [64 x i32], align 4
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[GEP_0_30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 30
+; CHECK-NEXT: [[GEP_0_48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16
+; CHECK-NEXT: [[GEP_ARR_30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 30
+; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD_0_30:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_30]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_30]], ptr nonnull [[GEP_ARR_30]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]]
;
%arr = alloca [64 x i32], align 4
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
@@ -99,9 +140,23 @@ define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
-; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
-; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT: [[ARR:%.*]] = alloca [64 x i32], align 4
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT: [[GEP_0_46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 46
+; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16
+; CHECK-NEXT: [[GEP_ARR_46:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 46
+; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[LOAD_0_46:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_46]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_46]], ptr nonnull [[GEP_ARR_46]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[SMALLMASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 2)
+; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[SMALLMASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_46]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]]
+;
%arr = alloca [64 x i32], align 4
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
@@ -131,7 +186,12 @@ define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
-; CHECK-NOT: store i32 20, ptr %gep.1.12
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]]
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
%gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
@@ -144,10 +204,17 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
}
-; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
-; CHECK-NOT: store i32 20, ptr %1
-; CHECK: store i32 50, ptr %gep.5
define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
+; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 5
+; CHECK-NEXT: store i32 50, ptr [[GEP_5]], align 4
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[TMP0]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT: ret <4 x float> [[RETVAL]]
+;
%mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
store i32 20, ptr %1
@@ -164,8 +231,16 @@ define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1)
; This get active lane mask may cover 4 or 8 integers
define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
-; CHECK: store i32 10, ptr %gep.1.12
-; CHECK: store i32 20, ptr %gep.1.28
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT: [[GEP_1_12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; CHECK-NEXT: store i32 10, ptr [[GEP_1_12]], align 4
+; CHECK-NEXT: [[GEP_1_28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 28
+; CHECK-NEXT: store i32 20, ptr [[GEP_1_28]], align 4
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]]
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
%gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
@@ -182,7 +257,13 @@ define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
; Don't do anything if the mask's Op1 < Op0
define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
-; CHECK: store i32 20, ptr %1
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
+; CHECK-NEXT: store i32 20, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]]
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
store i32 20, ptr %1
@@ -196,7 +277,13 @@ define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
; Don't do anything if the mask's Op1 == Op0
define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
-; CHECK: store i32 20, ptr %1
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
+; CHECK-NEXT: store i32 20, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]]
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
store i32 20, ptr %1
@@ -209,8 +296,14 @@ define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
-; CHECK-NOT: store i8 60, ptr %gep.1.6
-; CHECK: store i8 120, ptr %gep.1.8
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 0, i8 7)
+; CHECK-NEXT: [[GEP_1_8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 8
+; CHECK-NEXT: store i8 120, ptr [[GEP_1_8]], align 1
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[LOAD_0]], ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 16 x i8> [[RETVAL]]
;
%mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7)
%gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
@@ -226,10 +319,14 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
-; CHECK-NOT: store i32 10, ptr %gep.1.0
-; CHECK-NOT: store i32 20, ptr %gep.1.4
-; CHECK-NOT: store i32 30, ptr %gep.1.8
-; CHECK: store i32 40, ptr %gep.1.12
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
+; CHECK-NEXT: [[GEP_1_12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; CHECK-NEXT: store i32 40, ptr [[GEP_1_12]], align 4
+; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]]
;
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
%gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 38bd98f..15d67489 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -7,708 +7,708 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @ext() {
; CHECK-LABEL: 'ext'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> poison to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r0 = sext i1 undef to i8
- %r1 = zext i1 undef to i8
- %r2 = sext i1 undef to i16
- %r3 = zext i1 undef to i16
- %r4 = sext i1 undef to i32
- %r5 = zext i1 undef to i32
- %r6 = sext i1 undef to i64
- %r7 = zext i1 undef to i64
- %r9 = sext i8 undef to i16
- %r10 = zext i8 undef to i16
- %r11 = sext i8 undef to i32
- %r12 = zext i8 undef to i32
- %r13 = sext i8 undef to i64
- %r14 = zext i8 undef to i64
- %r17 = sext i16 undef to i32
- %r18 = zext i16 undef to i32
- %r19 = sext i16 undef to i64
- %r20 = zext i16 undef to i64
- %r24 = sext i32 undef to i64
- %r25 = zext i32 undef to i64
+ %r0 = sext i1 poison to i8
+ %r1 = zext i1 poison to i8
+ %r2 = sext i1 poison to i16
+ %r3 = zext i1 poison to i16
+ %r4 = sext i1 poison to i32
+ %r5 = zext i1 poison to i32
+ %r6 = sext i1 poison to i64
+ %r7 = zext i1 poison to i64
+ %r9 = sext i8 poison to i16
+ %r10 = zext i8 poison to i16
+ %r11 = sext i8 poison to i32
+ %r12 = zext i8 poison to i32
+ %r13 = sext i8 poison to i64
+ %r14 = zext i8 poison to i64
+ %r17 = sext i16 poison to i32
+ %r18 = zext i16 poison to i32
+ %r19 = sext i16 poison to i64
+ %r20 = zext i16 poison to i64
+ %r24 = sext i32 poison to i64
+ %r25 = zext i32 poison to i64
- %s2i8i16 = sext <2 x i8> undef to <2 x i16>
- %z2i8i16 = zext <2 x i8> undef to <2 x i16>
- %s2i8i32 = sext <2 x i8> undef to <2 x i32>
- %z2i8i32 = zext <2 x i8> undef to <2 x i32>
- %s2i8i64 = sext <2 x i8> undef to <2 x i64>
- %z2i8i64 = zext <2 x i8> undef to <2 x i64>
- %s2i16i32 = sext <2 x i16> undef to <2 x i32>
- %z2i16i32 = zext <2 x i16> undef to <2 x i32>
- %s2i16i64 = sext <2 x i16> undef to <2 x i64>
- %z2i16i64 = zext <2 x i16> undef to <2 x i64>
- %s2i32i64 = sext <2 x i32> undef to <2 x i64>
- %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+ %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+ %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+ %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+ %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+ %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+ %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+ %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+ %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+ %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+ %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+ %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+ %z2i32i64 = zext <2 x i32> poison to <2 x i64>
- %s4i8i16 = sext <4 x i8> undef to <4 x i16>
- %z4i8i16 = zext <4 x i8> undef to <4 x i16>
- %s4i8i32 = sext <4 x i8> undef to <4 x i32>
- %z4i8i32 = zext <4 x i8> undef to <4 x i32>
- %s4i8i64 = sext <4 x i8> undef to <4 x i64>
- %z4i8i64 = zext <4 x i8> undef to <4 x i64>
- %s4i16i32 = sext <4 x i16> undef to <4 x i32>
- %z4i16i32 = zext <4 x i16> undef to <4 x i32>
- %s4i16i64 = sext <4 x i16> undef to <4 x i64>
- %z4i16i64 = zext <4 x i16> undef to <4 x i64>
- %s4i32i64 = sext <4 x i32> undef to <4 x i64>
- %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+ %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+ %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+ %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+ %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+ %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+ %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+ %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+ %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+ %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+ %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+ %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+ %z4i32i64 = zext <4 x i32> poison to <4 x i64>
- %s8i8i16 = sext <8 x i8> undef to <8 x i16>
- %z8i8i16 = zext <8 x i8> undef to <8 x i16>
- %s8i8i32 = sext <8 x i8> undef to <8 x i32>
- %z8i8i32 = zext <8 x i8> undef to <8 x i32>
- %s8i8i64 = sext <8 x i8> undef to <8 x i64>
- %z8i8i64 = zext <8 x i8> undef to <8 x i64>
- %s8i16i32 = sext <8 x i16> undef to <8 x i32>
- %z8i16i32 = zext <8 x i16> undef to <8 x i32>
- %s8i16i64 = sext <8 x i16> undef to <8 x i64>
- %z8i16i64 = zext <8 x i16> undef to <8 x i64>
- %s8i32i64 = sext <8 x i32> undef to <8 x i64>
- %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+ %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+ %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+ %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+ %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+ %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+ %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+ %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+ %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+ %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+ %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+ %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+ %z8i32i64 = zext <8 x i32> poison to <8 x i64>
- %s16i8i16 = sext <16 x i8> undef to <16 x i16>
- %z16i8i16 = zext <16 x i8> undef to <16 x i16>
- %s16i8i32 = sext <16 x i8> undef to <16 x i32>
- %z16i8i32 = zext <16 x i8> undef to <16 x i32>
- %s16i8i64 = sext <16 x i8> undef to <16 x i64>
- %z16i8i64 = zext <16 x i8> undef to <16 x i64>
- %s16i16i32 = sext <16 x i16> undef to <16 x i32>
- %z16i16i32 = zext <16 x i16> undef to <16 x i32>
- %s16i16i64 = sext <16 x i16> undef to <16 x i64>
- %z16i16i64 = zext <16 x i16> undef to <16 x i64>
- %s16i32i64 = sext <16 x i32> undef to <16 x i64>
- %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+ %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+ %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+ %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+ %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+ %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+ %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+ %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+ %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+ %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+ %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+ %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+ %z16i32i64 = zext <16 x i32> poison to <16 x i64>
ret void
}
define void @trunc() {
; CHECK-LABEL: 'trunc'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r8 = trunc i8 undef to i1
- %r15 = trunc i16 undef to i1
- %r16 = trunc i16 undef to i8
- %r21 = trunc i32 undef to i1
- %r22 = trunc i32 undef to i8
- %r23 = trunc i32 undef to i16
- %r26 = trunc i64 undef to i1
- %r27 = trunc i64 undef to i8
- %r28 = trunc i64 undef to i16
- %r29 = trunc i64 undef to i32
+ %r8 = trunc i8 poison to i1
+ %r15 = trunc i16 poison to i1
+ %r16 = trunc i16 poison to i8
+ %r21 = trunc i32 poison to i1
+ %r22 = trunc i32 poison to i8
+ %r23 = trunc i32 poison to i16
+ %r26 = trunc i64 poison to i1
+ %r27 = trunc i64 poison to i8
+ %r28 = trunc i64 poison to i16
+ %r29 = trunc i64 poison to i32
- %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
- %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
- %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
- %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
- %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
- %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
+ %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+ %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+ %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+ %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+ %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+ %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
- %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
- %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
- %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
- %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
- %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
- %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
+ %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+ %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+ %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+ %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+ %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+ %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
- %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
- %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
- %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
- %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
- %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
- %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
+ %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+ %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+ %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+ %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+ %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+ %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
- %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
- %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
- %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
- %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
- %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
- %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+ %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+ %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+ %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+ %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+ %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+ %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
ret void
}
define i32 @casts_no_users() {
; CHECK-LABEL: 'casts_no_users'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncf64f16 = fptrunc double undef to half
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f64f16 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f64f16 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f64f16 = fptrunc <8 x double> undef to <8 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f64f16 = fptrunc <16 x double> undef to <16 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv32f16 = fptrunc float undef to half
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv2f32f16 = fptrunc <2 x float> undef to <2 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv4f32f16 = fptrunc <4 x float> undef to <4 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f32f16 = fptrunc <8 x float> undef to <8 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f32f16 = fptrunc <16 x float> undef to <16 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext half undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x half> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x half> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x half> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x half> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %extf16f64 = fpext half undef to double
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x half> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x half> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x half> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x half> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double poison to i1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double poison to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncf64f16 = fptrunc double poison to half
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f64f16 = fptrunc <2 x double> poison to <2 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f64f16 = fptrunc <4 x double> poison to <4 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f64f16 = fptrunc <8 x double> poison to <8 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f64f16 = fptrunc <16 x double> poison to <16 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv32f16 = fptrunc float poison to half
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv2f32f16 = fptrunc <2 x float> poison to <2 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %truncv4f32f16 = fptrunc <4 x float> poison to <4 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f32f16 = fptrunc <8 x float> poison to <8 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f32f16 = fptrunc <16 x float> poison to <16 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext half poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x half> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x half> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x half> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x half> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %extf16f64 = fpext half poison to double
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x half> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x half> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x half> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x half> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> poison to <2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> poison to <2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> poison to <2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> poison to <2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> poison to <2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> poison to <2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> poison to <2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> poison to <2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> poison to <4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> poison to <4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> poison to <4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> poison to <4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> poison to <4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> poison to <4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> poison to <4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> poison to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> poison to <8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> poison to <8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> poison to <8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> poison to <8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> poison to <8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> poison to <8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> poison to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> poison to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> poison to <16 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> poison to <16 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> poison to <16 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> poison to <16 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> poison to <16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> poison to <16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> poison to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> poison to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> poison to <2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> poison to <2 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> poison to <4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> poison to <4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> poison to <8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i16> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i16> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> poison to <8 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> poison to <16 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i16> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i16> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> poison to <16 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> poison to <16 x double>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %r30 = fptoui float undef to i1
- %r31 = fptosi float undef to i1
- %r32 = fptoui float undef to i8
- %r33 = fptosi float undef to i8
- %r34 = fptoui float undef to i16
- %r35 = fptosi float undef to i16
- %r36 = fptoui float undef to i32
- %r37 = fptosi float undef to i32
- %r38 = fptoui float undef to i64
- %r39 = fptosi float undef to i64
- %r40 = fptoui double undef to i1
- %r41 = fptosi double undef to i1
- %r42 = fptoui double undef to i8
- %r43 = fptosi double undef to i8
- %r44 = fptoui double undef to i16
- %r45 = fptosi double undef to i16
- %r46 = fptoui double undef to i32
- %r47 = fptosi double undef to i32
- %r48 = fptoui double undef to i64
- %r49 = fptosi double undef to i64
- %r50 = sitofp i1 undef to float
- %r51 = uitofp i1 undef to float
- %r52 = sitofp i1 undef to double
- %r53 = uitofp i1 undef to double
- %r54 = sitofp i8 undef to float
- %r55 = uitofp i8 undef to float
- %r56 = sitofp i8 undef to double
- %r57 = uitofp i8 undef to double
- %r58 = sitofp i16 undef to float
- %r59 = uitofp i16 undef to float
- %r60 = sitofp i16 undef to double
- %r61 = uitofp i16 undef to double
- %r62 = sitofp i32 undef to float
- %r63 = uitofp i32 undef to float
- %r64 = sitofp i32 undef to double
- %r65 = uitofp i32 undef to double
- %r66 = sitofp i64 undef to float
- %r67 = uitofp i64 undef to float
- %r68 = sitofp i64 undef to double
- %r69 = uitofp i64 undef to double
- %r80 = fptrunc double undef to float
- %r81 = fptrunc <2 x double> undef to <2 x float>
- %r82 = fptrunc <4 x double> undef to <4 x float>
- %r83 = fptrunc <8 x double> undef to <8 x float>
- %r84 = fptrunc <16 x double> undef to <16 x float>
- %truncf64f16 = fptrunc double undef to half
- %truncv2f64f16 = fptrunc <2 x double> undef to <2 x half>
- %truncv4f64f16 = fptrunc <4 x double> undef to <4 x half>
- %truncv8f64f16 = fptrunc <8 x double> undef to <8 x half>
- %truncv16f64f16 = fptrunc <16 x double> undef to <16 x half>
- %truncv32f16 = fptrunc float undef to half
- %truncv2f32f16 = fptrunc <2 x float> undef to <2 x half>
- %truncv4f32f16 = fptrunc <4 x float> undef to <4 x half>
- %truncv8f32f16 = fptrunc <8 x float> undef to <8 x half>
- %truncv16f32f16 = fptrunc <16 x float> undef to <16 x half>
- %r85 = fpext float undef to double
- %r86 = fpext <2 x float> undef to <2 x double>
- %r87 = fpext <4 x float> undef to <4 x double>
- %r88 = fpext <8 x float> undef to <8 x double>
- %r89 = fpext <16 x float> undef to <16 x double>
- %extf16f32 = fpext half undef to float
- %extv2f16f32 = fpext <2 x half> undef to <2 x float>
- %extv4f16f32 = fpext <4 x half> undef to <4 x float>
- %extv8f16f32 = fpext <8 x half> undef to <8 x float>
- %extv16f16f32 = fpext <16 x half> undef to <16 x float>
- %extf16f64 = fpext half undef to double
- %extv2f16f64 = fpext <2 x half> undef to <2 x double>
- %extv4f16f64 = fpext <4 x half> undef to <4 x double>
- %extv8f16f64 = fpext <8 x half> undef to <8 x double>
- %extv16f16f64 = fpext <16 x half> undef to <16 x double>
- %r90 = fptoui <2 x float> undef to <2 x i1>
- %r91 = fptosi <2 x float> undef to <2 x i1>
- %r92 = fptoui <2 x float> undef to <2 x i8>
- %r93 = fptosi <2 x float> undef to <2 x i8>
- %r94 = fptoui <2 x float> undef to <2 x i16>
- %r95 = fptosi <2 x float> undef to <2 x i16>
- %r96 = fptoui <2 x float> undef to <2 x i32>
- %r97 = fptosi <2 x float> undef to <2 x i32>
- %r98 = fptoui <2 x float> undef to <2 x i64>
- %r99 = fptosi <2 x float> undef to <2 x i64>
- %r100 = fptoui <2 x double> undef to <2 x i1>
- %r101 = fptosi <2 x double> undef to <2 x i1>
- %r102 = fptoui <2 x double> undef to <2 x i8>
- %r103 = fptosi <2 x double> undef to <2 x i8>
- %r104 = fptoui <2 x double> undef to <2 x i16>
- %r105 = fptosi <2 x double> undef to <2 x i16>
- %r106 = fptoui <2 x double> undef to <2 x i32>
- %r107 = fptosi <2 x double> undef to <2 x i32>
- %r108 = fptoui <2 x double> undef to <2 x i64>
- %r109 = fptosi <2 x double> undef to <2 x i64>
+ %r30 = fptoui float poison to i1
+ %r31 = fptosi float poison to i1
+ %r32 = fptoui float poison to i8
+ %r33 = fptosi float poison to i8
+ %r34 = fptoui float poison to i16
+ %r35 = fptosi float poison to i16
+ %r36 = fptoui float poison to i32
+ %r37 = fptosi float poison to i32
+ %r38 = fptoui float poison to i64
+ %r39 = fptosi float poison to i64
+ %r40 = fptoui double poison to i1
+ %r41 = fptosi double poison to i1
+ %r42 = fptoui double poison to i8
+ %r43 = fptosi double poison to i8
+ %r44 = fptoui double poison to i16
+ %r45 = fptosi double poison to i16
+ %r46 = fptoui double poison to i32
+ %r47 = fptosi double poison to i32
+ %r48 = fptoui double poison to i64
+ %r49 = fptosi double poison to i64
+ %r50 = sitofp i1 poison to float
+ %r51 = uitofp i1 poison to float
+ %r52 = sitofp i1 poison to double
+ %r53 = uitofp i1 poison to double
+ %r54 = sitofp i8 poison to float
+ %r55 = uitofp i8 poison to float
+ %r56 = sitofp i8 poison to double
+ %r57 = uitofp i8 poison to double
+ %r58 = sitofp i16 poison to float
+ %r59 = uitofp i16 poison to float
+ %r60 = sitofp i16 poison to double
+ %r61 = uitofp i16 poison to double
+ %r62 = sitofp i32 poison to float
+ %r63 = uitofp i32 poison to float
+ %r64 = sitofp i32 poison to double
+ %r65 = uitofp i32 poison to double
+ %r66 = sitofp i64 poison to float
+ %r67 = uitofp i64 poison to float
+ %r68 = sitofp i64 poison to double
+ %r69 = uitofp i64 poison to double
+ %r80 = fptrunc double poison to float
+ %r81 = fptrunc <2 x double> poison to <2 x float>
+ %r82 = fptrunc <4 x double> poison to <4 x float>
+ %r83 = fptrunc <8 x double> poison to <8 x float>
+ %r84 = fptrunc <16 x double> poison to <16 x float>
+ %truncf64f16 = fptrunc double poison to half
+ %truncv2f64f16 = fptrunc <2 x double> poison to <2 x half>
+ %truncv4f64f16 = fptrunc <4 x double> poison to <4 x half>
+ %truncv8f64f16 = fptrunc <8 x double> poison to <8 x half>
+ %truncv16f64f16 = fptrunc <16 x double> poison to <16 x half>
+ %truncv32f16 = fptrunc float poison to half
+ %truncv2f32f16 = fptrunc <2 x float> poison to <2 x half>
+ %truncv4f32f16 = fptrunc <4 x float> poison to <4 x half>
+ %truncv8f32f16 = fptrunc <8 x float> poison to <8 x half>
+ %truncv16f32f16 = fptrunc <16 x float> poison to <16 x half>
+ %r85 = fpext float poison to double
+ %r86 = fpext <2 x float> poison to <2 x double>
+ %r87 = fpext <4 x float> poison to <4 x double>
+ %r88 = fpext <8 x float> poison to <8 x double>
+ %r89 = fpext <16 x float> poison to <16 x double>
+ %extf16f32 = fpext half poison to float
+ %extv2f16f32 = fpext <2 x half> poison to <2 x float>
+ %extv4f16f32 = fpext <4 x half> poison to <4 x float>
+ %extv8f16f32 = fpext <8 x half> poison to <8 x float>
+ %extv16f16f32 = fpext <16 x half> poison to <16 x float>
+ %extf16f64 = fpext half poison to double
+ %extv2f16f64 = fpext <2 x half> poison to <2 x double>
+ %extv4f16f64 = fpext <4 x half> poison to <4 x double>
+ %extv8f16f64 = fpext <8 x half> poison to <8 x double>
+ %extv16f16f64 = fpext <16 x half> poison to <16 x double>
+ %r90 = fptoui <2 x float> poison to <2 x i1>
+ %r91 = fptosi <2 x float> poison to <2 x i1>
+ %r92 = fptoui <2 x float> poison to <2 x i8>
+ %r93 = fptosi <2 x float> poison to <2 x i8>
+ %r94 = fptoui <2 x float> poison to <2 x i16>
+ %r95 = fptosi <2 x float> poison to <2 x i16>
+ %r96 = fptoui <2 x float> poison to <2 x i32>
+ %r97 = fptosi <2 x float> poison to <2 x i32>
+ %r98 = fptoui <2 x float> poison to <2 x i64>
+ %r99 = fptosi <2 x float> poison to <2 x i64>
+ %r100 = fptoui <2 x double> poison to <2 x i1>
+ %r101 = fptosi <2 x double> poison to <2 x i1>
+ %r102 = fptoui <2 x double> poison to <2 x i8>
+ %r103 = fptosi <2 x double> poison to <2 x i8>
+ %r104 = fptoui <2 x double> poison to <2 x i16>
+ %r105 = fptosi <2 x double> poison to <2 x i16>
+ %r106 = fptoui <2 x double> poison to <2 x i32>
+ %r107 = fptosi <2 x double> poison to <2 x i32>
+ %r108 = fptoui <2 x double> poison to <2 x i64>
+ %r109 = fptosi <2 x double> poison to <2 x i64>
- %r110 = fptoui <4 x float> undef to <4 x i1>
- %r111 = fptosi <4 x float> undef to <4 x i1>
- %r112 = fptoui <4 x float> undef to <4 x i8>
- %r113 = fptosi <4 x float> undef to <4 x i8>
- %r114 = fptoui <4 x float> undef to <4 x i16>
- %r115 = fptosi <4 x float> undef to <4 x i16>
- %r116 = fptoui <4 x float> undef to <4 x i32>
- %r117 = fptosi <4 x float> undef to <4 x i32>
- %r118 = fptoui <4 x float> undef to <4 x i64>
- %r119 = fptosi <4 x float> undef to <4 x i64>
+ %r110 = fptoui <4 x float> poison to <4 x i1>
+ %r111 = fptosi <4 x float> poison to <4 x i1>
+ %r112 = fptoui <4 x float> poison to <4 x i8>
+ %r113 = fptosi <4 x float> poison to <4 x i8>
+ %r114 = fptoui <4 x float> poison to <4 x i16>
+ %r115 = fptosi <4 x float> poison to <4 x i16>
+ %r116 = fptoui <4 x float> poison to <4 x i32>
+ %r117 = fptosi <4 x float> poison to <4 x i32>
+ %r118 = fptoui <4 x float> poison to <4 x i64>
+ %r119 = fptosi <4 x float> poison to <4 x i64>
- %r120 = fptoui <4 x double> undef to <4 x i1>
- %r121 = fptosi <4 x double> undef to <4 x i1>
- %r122 = fptoui <4 x double> undef to <4 x i8>
- %r123 = fptosi <4 x double> undef to <4 x i8>
- %r124 = fptoui <4 x double> undef to <4 x i16>
- %r125 = fptosi <4 x double> undef to <4 x i16>
- %r126 = fptoui <4 x double> undef to <4 x i32>
- %r127 = fptosi <4 x double> undef to <4 x i32>
- %r128 = fptoui <4 x double> undef to <4 x i64>
- %r129 = fptosi <4 x double> undef to <4 x i64>
+ %r120 = fptoui <4 x double> poison to <4 x i1>
+ %r121 = fptosi <4 x double> poison to <4 x i1>
+ %r122 = fptoui <4 x double> poison to <4 x i8>
+ %r123 = fptosi <4 x double> poison to <4 x i8>
+ %r124 = fptoui <4 x double> poison to <4 x i16>
+ %r125 = fptosi <4 x double> poison to <4 x i16>
+ %r126 = fptoui <4 x double> poison to <4 x i32>
+ %r127 = fptosi <4 x double> poison to <4 x i32>
+ %r128 = fptoui <4 x double> poison to <4 x i64>
+ %r129 = fptosi <4 x double> poison to <4 x i64>
- %r130 = fptoui <8 x float> undef to <8 x i1>
- %r131 = fptosi <8 x float> undef to <8 x i1>
- %r132 = fptoui <8 x float> undef to <8 x i8>
- %r133 = fptosi <8 x float> undef to <8 x i8>
- %r134 = fptoui <8 x float> undef to <8 x i16>
- %r135 = fptosi <8 x float> undef to <8 x i16>
- %r136 = fptoui <8 x float> undef to <8 x i32>
- %r137 = fptosi <8 x float> undef to <8 x i32>
- %r138 = fptoui <8 x float> undef to <8 x i64>
- %r139 = fptosi <8 x float> undef to <8 x i64>
+ %r130 = fptoui <8 x float> poison to <8 x i1>
+ %r131 = fptosi <8 x float> poison to <8 x i1>
+ %r132 = fptoui <8 x float> poison to <8 x i8>
+ %r133 = fptosi <8 x float> poison to <8 x i8>
+ %r134 = fptoui <8 x float> poison to <8 x i16>
+ %r135 = fptosi <8 x float> poison to <8 x i16>
+ %r136 = fptoui <8 x float> poison to <8 x i32>
+ %r137 = fptosi <8 x float> poison to <8 x i32>
+ %r138 = fptoui <8 x float> poison to <8 x i64>
+ %r139 = fptosi <8 x float> poison to <8 x i64>
- %r140 = fptoui <8 x double> undef to <8 x i1>
- %r141 = fptosi <8 x double> undef to <8 x i1>
- %r142 = fptoui <8 x double> undef to <8 x i8>
- %r143 = fptosi <8 x double> undef to <8 x i8>
- %r144 = fptoui <8 x double> undef to <8 x i16>
- %r145 = fptosi <8 x double> undef to <8 x i16>
- %r146 = fptoui <8 x double> undef to <8 x i32>
- %r147 = fptosi <8 x double> undef to <8 x i32>
- %r148 = fptoui <8 x double> undef to <8 x i64>
- %r149 = fptosi <8 x double> undef to <8 x i64>
+ %r140 = fptoui <8 x double> poison to <8 x i1>
+ %r141 = fptosi <8 x double> poison to <8 x i1>
+ %r142 = fptoui <8 x double> poison to <8 x i8>
+ %r143 = fptosi <8 x double> poison to <8 x i8>
+ %r144 = fptoui <8 x double> poison to <8 x i16>
+ %r145 = fptosi <8 x double> poison to <8 x i16>
+ %r146 = fptoui <8 x double> poison to <8 x i32>
+ %r147 = fptosi <8 x double> poison to <8 x i32>
+ %r148 = fptoui <8 x double> poison to <8 x i64>
+ %r149 = fptosi <8 x double> poison to <8 x i64>
- %r150 = fptoui <16 x float> undef to <16 x i1>
- %r151 = fptosi <16 x float> undef to <16 x i1>
- %r152 = fptoui <16 x float> undef to <16 x i8>
- %r153 = fptosi <16 x float> undef to <16 x i8>
- %r154 = fptoui <16 x float> undef to <16 x i16>
- %r155 = fptosi <16 x float> undef to <16 x i16>
- %r156 = fptoui <16 x float> undef to <16 x i32>
- %r157 = fptosi <16 x float> undef to <16 x i32>
- %r158 = fptoui <16 x float> undef to <16 x i64>
- %r159 = fptosi <16 x float> undef to <16 x i64>
+ %r150 = fptoui <16 x float> poison to <16 x i1>
+ %r151 = fptosi <16 x float> poison to <16 x i1>
+ %r152 = fptoui <16 x float> poison to <16 x i8>
+ %r153 = fptosi <16 x float> poison to <16 x i8>
+ %r154 = fptoui <16 x float> poison to <16 x i16>
+ %r155 = fptosi <16 x float> poison to <16 x i16>
+ %r156 = fptoui <16 x float> poison to <16 x i32>
+ %r157 = fptosi <16 x float> poison to <16 x i32>
+ %r158 = fptoui <16 x float> poison to <16 x i64>
+ %r159 = fptosi <16 x float> poison to <16 x i64>
- %r160 = fptoui <16 x double> undef to <16 x i1>
- %r161 = fptosi <16 x double> undef to <16 x i1>
- %r162 = fptoui <16 x double> undef to <16 x i8>
- %r163 = fptosi <16 x double> undef to <16 x i8>
- %r164 = fptoui <16 x double> undef to <16 x i16>
- %r165 = fptosi <16 x double> undef to <16 x i16>
- %r166 = fptoui <16 x double> undef to <16 x i32>
- %r167 = fptosi <16 x double> undef to <16 x i32>
- %r168 = fptoui <16 x double> undef to <16 x i64>
- %r169 = fptosi <16 x double> undef to <16 x i64>
+ %r160 = fptoui <16 x double> poison to <16 x i1>
+ %r161 = fptosi <16 x double> poison to <16 x i1>
+ %r162 = fptoui <16 x double> poison to <16 x i8>
+ %r163 = fptosi <16 x double> poison to <16 x i8>
+ %r164 = fptoui <16 x double> poison to <16 x i16>
+ %r165 = fptosi <16 x double> poison to <16 x i16>
+ %r166 = fptoui <16 x double> poison to <16 x i32>
+ %r167 = fptosi <16 x double> poison to <16 x i32>
+ %r168 = fptoui <16 x double> poison to <16 x i64>
+ %r169 = fptosi <16 x double> poison to <16 x i64>
- %r170 = uitofp <2 x i1> undef to <2 x float>
- %r171 = sitofp <2 x i1> undef to <2 x float>
- %r172 = uitofp <2 x i8> undef to <2 x float>
- %r173 = sitofp <2 x i8> undef to <2 x float>
- %r174 = uitofp <2 x i16> undef to <2 x float>
- %r175 = sitofp <2 x i16> undef to <2 x float>
- %r176 = uitofp <2 x i32> undef to <2 x float>
- %r177 = sitofp <2 x i32> undef to <2 x float>
- %r178 = uitofp <2 x i64> undef to <2 x float>
- %r179 = sitofp <2 x i64> undef to <2 x float>
+ %r170 = uitofp <2 x i1> poison to <2 x float>
+ %r171 = sitofp <2 x i1> poison to <2 x float>
+ %r172 = uitofp <2 x i8> poison to <2 x float>
+ %r173 = sitofp <2 x i8> poison to <2 x float>
+ %r174 = uitofp <2 x i16> poison to <2 x float>
+ %r175 = sitofp <2 x i16> poison to <2 x float>
+ %r176 = uitofp <2 x i32> poison to <2 x float>
+ %r177 = sitofp <2 x i32> poison to <2 x float>
+ %r178 = uitofp <2 x i64> poison to <2 x float>
+ %r179 = sitofp <2 x i64> poison to <2 x float>
- %r180 = uitofp <2 x i1> undef to <2 x double>
- %r181 = sitofp <2 x i1> undef to <2 x double>
- %r182 = uitofp <2 x i8> undef to <2 x double>
- %r183 = sitofp <2 x i8> undef to <2 x double>
- %r184 = uitofp <2 x i16> undef to <2 x double>
- %r185 = sitofp <2 x i16> undef to <2 x double>
- %r186 = uitofp <2 x i32> undef to <2 x double>
- %r187 = sitofp <2 x i32> undef to <2 x double>
- %r188 = uitofp <2 x i64> undef to <2 x double>
- %r189 = sitofp <2 x i64> undef to <2 x double>
+ %r180 = uitofp <2 x i1> poison to <2 x double>
+ %r181 = sitofp <2 x i1> poison to <2 x double>
+ %r182 = uitofp <2 x i8> poison to <2 x double>
+ %r183 = sitofp <2 x i8> poison to <2 x double>
+ %r184 = uitofp <2 x i16> poison to <2 x double>
+ %r185 = sitofp <2 x i16> poison to <2 x double>
+ %r186 = uitofp <2 x i32> poison to <2 x double>
+ %r187 = sitofp <2 x i32> poison to <2 x double>
+ %r188 = uitofp <2 x i64> poison to <2 x double>
+ %r189 = sitofp <2 x i64> poison to <2 x double>
- %r190 = uitofp <4 x i1> undef to <4 x float>
- %r191 = sitofp <4 x i1> undef to <4 x float>
- %r192 = uitofp <4 x i8> undef to <4 x float>
- %r193 = sitofp <4 x i8> undef to <4 x float>
- %r194 = uitofp <4 x i16> undef to <4 x float>
- %r195 = sitofp <4 x i16> undef to <4 x float>
- %r196 = uitofp <4 x i32> undef to <4 x float>
- %r197 = sitofp <4 x i32> undef to <4 x float>
- %r198 = uitofp <4 x i64> undef to <4 x float>
- %r199 = sitofp <4 x i64> undef to <4 x float>
+ %r190 = uitofp <4 x i1> poison to <4 x float>
+ %r191 = sitofp <4 x i1> poison to <4 x float>
+ %r192 = uitofp <4 x i8> poison to <4 x float>
+ %r193 = sitofp <4 x i8> poison to <4 x float>
+ %r194 = uitofp <4 x i16> poison to <4 x float>
+ %r195 = sitofp <4 x i16> poison to <4 x float>
+ %r196 = uitofp <4 x i32> poison to <4 x float>
+ %r197 = sitofp <4 x i32> poison to <4 x float>
+ %r198 = uitofp <4 x i64> poison to <4 x float>
+ %r199 = sitofp <4 x i64> poison to <4 x float>
- %r200 = uitofp <4 x i1> undef to <4 x double>
- %r201 = sitofp <4 x i1> undef to <4 x double>
- %r202 = uitofp <4 x i8> undef to <4 x double>
- %r203 = sitofp <4 x i8> undef to <4 x double>
- %r204 = uitofp <4 x i16> undef to <4 x double>
- %r205 = sitofp <4 x i16> undef to <4 x double>
- %r206 = uitofp <4 x i32> undef to <4 x double>
- %r207 = sitofp <4 x i32> undef to <4 x double>
- %r208 = uitofp <4 x i64> undef to <4 x double>
- %r209 = sitofp <4 x i64> undef to <4 x double>
+ %r200 = uitofp <4 x i1> poison to <4 x double>
+ %r201 = sitofp <4 x i1> poison to <4 x double>
+ %r202 = uitofp <4 x i8> poison to <4 x double>
+ %r203 = sitofp <4 x i8> poison to <4 x double>
+ %r204 = uitofp <4 x i16> poison to <4 x double>
+ %r205 = sitofp <4 x i16> poison to <4 x double>
+ %r206 = uitofp <4 x i32> poison to <4 x double>
+ %r207 = sitofp <4 x i32> poison to <4 x double>
+ %r208 = uitofp <4 x i64> poison to <4 x double>
+ %r209 = sitofp <4 x i64> poison to <4 x double>
- %r210 = uitofp <8 x i1> undef to <8 x float>
- %r211 = sitofp <8 x i1> undef to <8 x float>
- %r212 = uitofp <8 x i8> undef to <8 x float>
- %r213 = sitofp <8 x i8> undef to <8 x float>
- %r214 = uitofp <8 x i16> undef to <8 x float>
- %r215 = sitofp <8 x i16> undef to <8 x float>
- %r216 = uitofp <8 x i32> undef to <8 x float>
- %r217 = sitofp <8 x i32> undef to <8 x float>
- %r218 = uitofp <8 x i64> undef to <8 x float>
- %r219 = sitofp <8 x i64> undef to <8 x float>
+ %r210 = uitofp <8 x i1> poison to <8 x float>
+ %r211 = sitofp <8 x i1> poison to <8 x float>
+ %r212 = uitofp <8 x i8> poison to <8 x float>
+ %r213 = sitofp <8 x i8> poison to <8 x float>
+ %r214 = uitofp <8 x i16> poison to <8 x float>
+ %r215 = sitofp <8 x i16> poison to <8 x float>
+ %r216 = uitofp <8 x i32> poison to <8 x float>
+ %r217 = sitofp <8 x i32> poison to <8 x float>
+ %r218 = uitofp <8 x i64> poison to <8 x float>
+ %r219 = sitofp <8 x i64> poison to <8 x float>
- %r220 = uitofp <8 x i1> undef to <8 x double>
- %r221 = sitofp <8 x i1> undef to <8 x double>
- %r222 = uitofp <8 x i8> undef to <8 x double>
- %r223 = sitofp <8 x i8> undef to <8 x double>
- %r224 = uitofp <8 x i16> undef to <8 x double>
- %r225 = sitofp <8 x i16> undef to <8 x double>
- %r226 = uitofp <8 x i16> undef to <8 x double>
- %r227 = sitofp <8 x i16> undef to <8 x double>
- %r228 = uitofp <8 x i64> undef to <8 x double>
- %r229 = sitofp <8 x i64> undef to <8 x double>
+ %r220 = uitofp <8 x i1> poison to <8 x double>
+ %r221 = sitofp <8 x i1> poison to <8 x double>
+ %r222 = uitofp <8 x i8> poison to <8 x double>
+ %r223 = sitofp <8 x i8> poison to <8 x double>
+ %r224 = uitofp <8 x i16> poison to <8 x double>
+ %r225 = sitofp <8 x i16> poison to <8 x double>
+ %r226 = uitofp <8 x i16> poison to <8 x double>
+ %r227 = sitofp <8 x i16> poison to <8 x double>
+ %r228 = uitofp <8 x i64> poison to <8 x double>
+ %r229 = sitofp <8 x i64> poison to <8 x double>
- %r230 = uitofp <16 x i1> undef to <16 x float>
- %r231 = sitofp <16 x i1> undef to <16 x float>
- %r232 = uitofp <16 x i8> undef to <16 x float>
- %r233 = sitofp <16 x i8> undef to <16 x float>
- %r234 = uitofp <16 x i16> undef to <16 x float>
- %r235 = sitofp <16 x i16> undef to <16 x float>
- %r236 = uitofp <16 x i32> undef to <16 x float>
- %r237 = sitofp <16 x i32> undef to <16 x float>
- %r238 = uitofp <16 x i64> undef to <16 x float>
- %r239 = sitofp <16 x i64> undef to <16 x float>
+ %r230 = uitofp <16 x i1> poison to <16 x float>
+ %r231 = sitofp <16 x i1> poison to <16 x float>
+ %r232 = uitofp <16 x i8> poison to <16 x float>
+ %r233 = sitofp <16 x i8> poison to <16 x float>
+ %r234 = uitofp <16 x i16> poison to <16 x float>
+ %r235 = sitofp <16 x i16> poison to <16 x float>
+ %r236 = uitofp <16 x i32> poison to <16 x float>
+ %r237 = sitofp <16 x i32> poison to <16 x float>
+ %r238 = uitofp <16 x i64> poison to <16 x float>
+ %r239 = sitofp <16 x i64> poison to <16 x float>
- %r240 = uitofp <16 x i1> undef to <16 x double>
- %r241 = sitofp <16 x i1> undef to <16 x double>
- %r242 = uitofp <16 x i8> undef to <16 x double>
- %r243 = sitofp <16 x i8> undef to <16 x double>
- %r244 = uitofp <16 x i16> undef to <16 x double>
- %r245 = sitofp <16 x i16> undef to <16 x double>
- %r246 = uitofp <16 x i16> undef to <16 x double>
- %r247 = sitofp <16 x i16> undef to <16 x double>
- %r248 = uitofp <16 x i64> undef to <16 x double>
- %r249 = sitofp <16 x i64> undef to <16 x double>
+ %r240 = uitofp <16 x i1> poison to <16 x double>
+ %r241 = sitofp <16 x i1> poison to <16 x double>
+ %r242 = uitofp <16 x i8> poison to <16 x double>
+ %r243 = sitofp <16 x i8> poison to <16 x double>
+ %r244 = uitofp <16 x i16> poison to <16 x double>
+ %r245 = sitofp <16 x i16> poison to <16 x double>
+ %r246 = uitofp <16 x i16> poison to <16 x double>
+ %r247 = sitofp <16 x i16> poison to <16 x double>
+ %r248 = uitofp <16 x i64> poison to <16 x double>
+ %r249 = sitofp <16 x i64> poison to <16 x double>
ret i32 undef
}
@@ -836,24 +836,24 @@ define i32 @casts_with_users(i8 %a, i16 %b, i32 %c, i64 %d, i1 %e) {
define i32 @bitcasts() {
; CHECK-LABEL: 'bitcasts'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %a = bitcast i32 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %b = bitcast float undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = bitcast i32 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = bitcast float undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = bitcast i64 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = bitcast double undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = bitcast half undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = bitcast i16 undef to half
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %a = bitcast i32 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %b = bitcast float poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = bitcast i32 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = bitcast float poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = bitcast i64 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = bitcast double poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = bitcast half poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = bitcast i16 poison to half
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %a = bitcast i32 undef to i32
- %b = bitcast float undef to float
- %c = bitcast i32 undef to float
- %d = bitcast float undef to i32
- %e = bitcast i64 undef to double
- %f = bitcast double undef to i64
- %g = bitcast half undef to i16
- %h = bitcast i16 undef to half
+ %a = bitcast i32 poison to i32
+ %b = bitcast float poison to float
+ %c = bitcast i32 poison to float
+ %d = bitcast float poison to i32
+ %e = bitcast i64 poison to double
+ %f = bitcast double poison to i64
+ %g = bitcast half poison to i16
+ %h = bitcast i16 poison to half
ret i32 undef
}
@@ -941,31 +941,31 @@ define i32 @load_extends() {
define i32 @store_truncs() {
; CHECK-LABEL: 'store_truncs'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = trunc i64 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = trunc i64 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r0, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = trunc i64 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = trunc i64 poison to i16
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r1, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = trunc i64 undef to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = trunc i64 poison to i32
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r2, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = trunc i32 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = trunc i32 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r3, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = trunc i32 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = trunc i32 poison to i16
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r4, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = trunc i16 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = trunc i16 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r5, ptr undef, align 1
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %r0 = trunc i64 undef to i8
+ %r0 = trunc i64 poison to i8
store i8 %r0, ptr undef
- %r1 = trunc i64 undef to i16
+ %r1 = trunc i64 poison to i16
store i16 %r1, ptr undef
- %r2 = trunc i64 undef to i32
+ %r2 = trunc i64 poison to i32
store i32 %r2, ptr undef
- %r3 = trunc i32 undef to i8
+ %r3 = trunc i32 poison to i8
store i8 %r3, ptr undef
- %r4 = trunc i32 undef to i16
+ %r4 = trunc i32 poison to i16
store i16 %r4, ptr undef
- %r5 = trunc i16 undef to i8
+ %r5 = trunc i16 poison to i8
store i8 %r5, ptr undef
ret i32 undef
}
@@ -1013,296 +1013,296 @@ declare void @use(i16, i16, i32, i32, i64, i64, i32, i32, i64, i64, i64, i64)
define void @fp16cast() {
; CHECK-NOFP16-LABEL: 'fp16cast'
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:82 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-FP16-LABEL: 'fp16cast'
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r30 = fptoui half undef to i1
- %r31 = fptosi half undef to i1
- %r32 = fptoui half undef to i8
- %r33 = fptosi half undef to i8
- %r34 = fptoui half undef to i16
- %r35 = fptosi half undef to i16
- %r36 = fptoui half undef to i32
- %r37 = fptosi half undef to i32
- %r38 = fptoui half undef to i64
- %r39 = fptosi half undef to i64
+ %r30 = fptoui half poison to i1
+ %r31 = fptosi half poison to i1
+ %r32 = fptoui half poison to i8
+ %r33 = fptosi half poison to i8
+ %r34 = fptoui half poison to i16
+ %r35 = fptosi half poison to i16
+ %r36 = fptoui half poison to i32
+ %r37 = fptosi half poison to i32
+ %r38 = fptoui half poison to i64
+ %r39 = fptosi half poison to i64
- %r90 = fptoui <2 x half> undef to <2 x i1>
- %r91 = fptosi <2 x half> undef to <2 x i1>
- %r92 = fptoui <2 x half> undef to <2 x i8>
- %r93 = fptosi <2 x half> undef to <2 x i8>
- %r94 = fptoui <2 x half> undef to <2 x i16>
- %r95 = fptosi <2 x half> undef to <2 x i16>
- %r96 = fptoui <2 x half> undef to <2 x i32>
- %r97 = fptosi <2 x half> undef to <2 x i32>
- %r98 = fptoui <2 x half> undef to <2 x i64>
- %r99 = fptosi <2 x half> undef to <2 x i64>
+ %r90 = fptoui <2 x half> poison to <2 x i1>
+ %r91 = fptosi <2 x half> poison to <2 x i1>
+ %r92 = fptoui <2 x half> poison to <2 x i8>
+ %r93 = fptosi <2 x half> poison to <2 x i8>
+ %r94 = fptoui <2 x half> poison to <2 x i16>
+ %r95 = fptosi <2 x half> poison to <2 x i16>
+ %r96 = fptoui <2 x half> poison to <2 x i32>
+ %r97 = fptosi <2 x half> poison to <2 x i32>
+ %r98 = fptoui <2 x half> poison to <2 x i64>
+ %r99 = fptosi <2 x half> poison to <2 x i64>
- %r110 = fptoui <4 x half> undef to <4 x i1>
- %r111 = fptosi <4 x half> undef to <4 x i1>
- %r112 = fptoui <4 x half> undef to <4 x i8>
- %r113 = fptosi <4 x half> undef to <4 x i8>
- %r114 = fptoui <4 x half> undef to <4 x i16>
- %r115 = fptosi <4 x half> undef to <4 x i16>
- %r116 = fptoui <4 x half> undef to <4 x i32>
- %r117 = fptosi <4 x half> undef to <4 x i32>
- %r118 = fptoui <4 x half> undef to <4 x i64>
- %r119 = fptosi <4 x half> undef to <4 x i64>
+ %r110 = fptoui <4 x half> poison to <4 x i1>
+ %r111 = fptosi <4 x half> poison to <4 x i1>
+ %r112 = fptoui <4 x half> poison to <4 x i8>
+ %r113 = fptosi <4 x half> poison to <4 x i8>
+ %r114 = fptoui <4 x half> poison to <4 x i16>
+ %r115 = fptosi <4 x half> poison to <4 x i16>
+ %r116 = fptoui <4 x half> poison to <4 x i32>
+ %r117 = fptosi <4 x half> poison to <4 x i32>
+ %r118 = fptoui <4 x half> poison to <4 x i64>
+ %r119 = fptosi <4 x half> poison to <4 x i64>
- %r130 = fptoui <8 x half> undef to <8 x i1>
- %r131 = fptosi <8 x half> undef to <8 x i1>
- %r132 = fptoui <8 x half> undef to <8 x i8>
- %r133 = fptosi <8 x half> undef to <8 x i8>
- %r134 = fptoui <8 x half> undef to <8 x i16>
- %r135 = fptosi <8 x half> undef to <8 x i16>
- %r136 = fptoui <8 x half> undef to <8 x i32>
- %r137 = fptosi <8 x half> undef to <8 x i32>
- %r138 = fptoui <8 x half> undef to <8 x i64>
- %r139 = fptosi <8 x half> undef to <8 x i64>
+ %r130 = fptoui <8 x half> poison to <8 x i1>
+ %r131 = fptosi <8 x half> poison to <8 x i1>
+ %r132 = fptoui <8 x half> poison to <8 x i8>
+ %r133 = fptosi <8 x half> poison to <8 x i8>
+ %r134 = fptoui <8 x half> poison to <8 x i16>
+ %r135 = fptosi <8 x half> poison to <8 x i16>
+ %r136 = fptoui <8 x half> poison to <8 x i32>
+ %r137 = fptosi <8 x half> poison to <8 x i32>
+ %r138 = fptoui <8 x half> poison to <8 x i64>
+ %r139 = fptosi <8 x half> poison to <8 x i64>
- %r150 = fptoui <16 x half> undef to <16 x i1>
- %r151 = fptosi <16 x half> undef to <16 x i1>
- %r152 = fptoui <16 x half> undef to <16 x i8>
- %r153 = fptosi <16 x half> undef to <16 x i8>
- %r154 = fptoui <16 x half> undef to <16 x i16>
- %r155 = fptosi <16 x half> undef to <16 x i16>
- %r156 = fptoui <16 x half> undef to <16 x i32>
- %r157 = fptosi <16 x half> undef to <16 x i32>
- %r158 = fptoui <16 x half> undef to <16 x i64>
- %r159 = fptosi <16 x half> undef to <16 x i64>
+ %r150 = fptoui <16 x half> poison to <16 x i1>
+ %r151 = fptosi <16 x half> poison to <16 x i1>
+ %r152 = fptoui <16 x half> poison to <16 x i8>
+ %r153 = fptosi <16 x half> poison to <16 x i8>
+ %r154 = fptoui <16 x half> poison to <16 x i16>
+ %r155 = fptosi <16 x half> poison to <16 x i16>
+ %r156 = fptoui <16 x half> poison to <16 x i32>
+ %r157 = fptosi <16 x half> poison to <16 x i32>
+ %r158 = fptoui <16 x half> poison to <16 x i64>
+ %r159 = fptosi <16 x half> poison to <16 x i64>
- %r250 = uitofp <8 x i1> undef to <8 x half>
- %r251 = sitofp <8 x i1> undef to <8 x half>
- %r252 = uitofp <8 x i8> undef to <8 x half>
- %r253 = sitofp <8 x i8> undef to <8 x half>
- %r254 = uitofp <8 x i16> undef to <8 x half>
- %r255 = sitofp <8 x i16> undef to <8 x half>
- %r256 = uitofp <8 x i32> undef to <8 x half>
- %r257 = sitofp <8 x i32> undef to <8 x half>
- %r258 = uitofp <8 x i64> undef to <8 x half>
- %r259 = sitofp <8 x i64> undef to <8 x half>
+ %r250 = uitofp <8 x i1> poison to <8 x half>
+ %r251 = sitofp <8 x i1> poison to <8 x half>
+ %r252 = uitofp <8 x i8> poison to <8 x half>
+ %r253 = sitofp <8 x i8> poison to <8 x half>
+ %r254 = uitofp <8 x i16> poison to <8 x half>
+ %r255 = sitofp <8 x i16> poison to <8 x half>
+ %r256 = uitofp <8 x i32> poison to <8 x half>
+ %r257 = sitofp <8 x i32> poison to <8 x half>
+ %r258 = uitofp <8 x i64> poison to <8 x half>
+ %r259 = sitofp <8 x i64> poison to <8 x half>
- %r260 = uitofp <16 x i1> undef to <16 x half>
- %r261 = sitofp <16 x i1> undef to <16 x half>
- %r262 = uitofp <16 x i8> undef to <16 x half>
- %r263 = sitofp <16 x i8> undef to <16 x half>
- %r264 = uitofp <16 x i16> undef to <16 x half>
- %r265 = sitofp <16 x i16> undef to <16 x half>
- %r266 = uitofp <16 x i32> undef to <16 x half>
- %r267 = sitofp <16 x i32> undef to <16 x half>
- %r268 = uitofp <16 x i64> undef to <16 x half>
- %r269 = sitofp <16 x i64> undef to <16 x half>
+ %r260 = uitofp <16 x i1> poison to <16 x half>
+ %r261 = sitofp <16 x i1> poison to <16 x half>
+ %r262 = uitofp <16 x i8> poison to <16 x half>
+ %r263 = sitofp <16 x i8> poison to <16 x half>
+ %r264 = uitofp <16 x i16> poison to <16 x half>
+ %r265 = sitofp <16 x i16> poison to <16 x half>
+ %r266 = uitofp <16 x i32> poison to <16 x half>
+ %r267 = sitofp <16 x i32> poison to <16 x half>
+ %r268 = uitofp <16 x i64> poison to <16 x half>
+ %r269 = sitofp <16 x i64> poison to <16 x half>
ret void
}
define void @bf16cast() {
; CHECK-NOFP16-LABEL: 'bf16cast'
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext bfloat undef to float
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extf16f64 = fpext bfloat undef to double
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncf16f32 = fptrunc float undef to bfloat
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:30 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %truncf16f64 = fptrunc double undef to bfloat
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
-; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext bfloat poison to float
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x bfloat> poison to <2 x float>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x bfloat> poison to <4 x float>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x bfloat> poison to <8 x float>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x bfloat> poison to <16 x float>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extf16f64 = fpext bfloat poison to double
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x bfloat> poison to <2 x double>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x bfloat> poison to <4 x double>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x bfloat> poison to <8 x double>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x bfloat> poison to <16 x double>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncf16f32 = fptrunc float poison to bfloat
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f32 = fptrunc <2 x float> poison to <2 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f32 = fptrunc <4 x float> poison to <4 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f32 = fptrunc <8 x float> poison to <8 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:30 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f32 = fptrunc <16 x float> poison to <16 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %truncf16f64 = fptrunc double poison to bfloat
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f64 = fptrunc <2 x double> poison to <2 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f64 = fptrunc <4 x double> poison to <4 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f64 = fptrunc <8 x double> poison to <8 x bfloat>
+; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f64 = fptrunc <16 x double> poison to <16 x bfloat>
; CHECK-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-BF16-LABEL: 'bf16cast'
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext bfloat undef to float
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extf16f64 = fpext bfloat undef to double
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncf16f32 = fptrunc float undef to bfloat
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncf16f64 = fptrunc double undef to bfloat
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extf16f32 = fpext bfloat poison to float
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extv2f16f32 = fpext <2 x bfloat> poison to <2 x float>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %extv4f16f32 = fpext <4 x bfloat> poison to <4 x float>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f32 = fpext <8 x bfloat> poison to <8 x float>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f32 = fpext <16 x bfloat> poison to <16 x float>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extf16f64 = fpext bfloat poison to double
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %extv2f16f64 = fpext <2 x bfloat> poison to <2 x double>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %extv4f16f64 = fpext <4 x bfloat> poison to <4 x double>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %extv8f16f64 = fpext <8 x bfloat> poison to <8 x double>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %extv16f16f64 = fpext <16 x bfloat> poison to <16 x double>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncf16f32 = fptrunc float poison to bfloat
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f32 = fptrunc <2 x float> poison to <2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncv4f16f32 = fptrunc <4 x float> poison to <4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f32 = fptrunc <8 x float> poison to <8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f32 = fptrunc <16 x float> poison to <16 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %truncf16f64 = fptrunc double poison to bfloat
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %truncv2f16f64 = fptrunc <2 x double> poison to <2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %truncv4f16f64 = fptrunc <4 x double> poison to <4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %truncv8f16f64 = fptrunc <8 x double> poison to <8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %truncv16f16f64 = fptrunc <16 x double> poison to <16 x bfloat>
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %extf16f32 = fpext bfloat undef to float
- %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
- %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
- %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
- %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
- %extf16f64 = fpext bfloat undef to double
- %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
- %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
- %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
- %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
- %truncf16f32 = fptrunc float undef to bfloat
- %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
- %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
- %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
- %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
- %truncf16f64 = fptrunc double undef to bfloat
- %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
- %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
- %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
- %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+ %extf16f32 = fpext bfloat poison to float
+ %extv2f16f32 = fpext <2 x bfloat> poison to <2 x float>
+ %extv4f16f32 = fpext <4 x bfloat> poison to <4 x float>
+ %extv8f16f32 = fpext <8 x bfloat> poison to <8 x float>
+ %extv16f16f32 = fpext <16 x bfloat> poison to <16 x float>
+ %extf16f64 = fpext bfloat poison to double
+ %extv2f16f64 = fpext <2 x bfloat> poison to <2 x double>
+ %extv4f16f64 = fpext <4 x bfloat> poison to <4 x double>
+ %extv8f16f64 = fpext <8 x bfloat> poison to <8 x double>
+ %extv16f16f64 = fpext <16 x bfloat> poison to <16 x double>
+ %truncf16f32 = fptrunc float poison to bfloat
+ %truncv2f16f32 = fptrunc <2 x float> poison to <2 x bfloat>
+ %truncv4f16f32 = fptrunc <4 x float> poison to <4 x bfloat>
+ %truncv8f16f32 = fptrunc <8 x float> poison to <8 x bfloat>
+ %truncv16f16f32 = fptrunc <16 x float> poison to <16 x bfloat>
+ %truncf16f64 = fptrunc double poison to bfloat
+ %truncv2f16f64 = fptrunc <2 x double> poison to <2 x bfloat>
+ %truncv4f16f64 = fptrunc <4 x double> poison to <4 x bfloat>
+ %truncv8f16f64 = fptrunc <8 x double> poison to <8 x bfloat>
+ %truncv16f16f64 = fptrunc <16 x double> poison to <16 x bfloat>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll
index 20b83be..9aea58e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll
@@ -7,13 +7,13 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @uitofp() {
; CHECK-NONEON-LABEL: 'uitofp'
-; CHECK-NONEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> undef to <16 x float>
+; CHECK-NONEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> poison to <16 x float>
; CHECK-NONEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-WITHSVE-LABEL: 'uitofp'
-; CHECK-WITHSVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> undef to <16 x float>
+; CHECK-WITHSVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> poison to <16 x float>
; CHECK-WITHSVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %conv = uitofp <16 x i64> undef to <16 x float>
+ %conv = uitofp <16 x i64> poison to <16 x float>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll b/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll
index cfb130e..ecb4e14 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll
@@ -8,1631 +8,1631 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @ext() {
; CHECK-SVE-LABEL: 'ext'
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> poison to <16 x i64>
; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SVE128-NO-NEON-LABEL: 'ext'
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> poison to <16 x i64>
; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-256-LABEL: 'ext'
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> poison to <16 x i64>
; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-2048-LABEL: 'ext'
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i32i64 = zext <8 x i32> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i32i64 = zext <16 x i32> poison to <16 x i64>
; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r0 = sext i1 undef to i8
- %r1 = zext i1 undef to i8
- %r2 = sext i1 undef to i16
- %r3 = zext i1 undef to i16
- %r4 = sext i1 undef to i32
- %r5 = zext i1 undef to i32
- %r6 = sext i1 undef to i64
- %r7 = zext i1 undef to i64
- %r9 = sext i8 undef to i16
- %r10 = zext i8 undef to i16
- %r11 = sext i8 undef to i32
- %r12 = zext i8 undef to i32
- %r13 = sext i8 undef to i64
- %r14 = zext i8 undef to i64
- %r17 = sext i16 undef to i32
- %r18 = zext i16 undef to i32
- %r19 = sext i16 undef to i64
- %r20 = zext i16 undef to i64
- %r24 = sext i32 undef to i64
- %r25 = zext i32 undef to i64
+ %r0 = sext i1 poison to i8
+ %r1 = zext i1 poison to i8
+ %r2 = sext i1 poison to i16
+ %r3 = zext i1 poison to i16
+ %r4 = sext i1 poison to i32
+ %r5 = zext i1 poison to i32
+ %r6 = sext i1 poison to i64
+ %r7 = zext i1 poison to i64
+ %r9 = sext i8 poison to i16
+ %r10 = zext i8 poison to i16
+ %r11 = sext i8 poison to i32
+ %r12 = zext i8 poison to i32
+ %r13 = sext i8 poison to i64
+ %r14 = zext i8 poison to i64
+ %r17 = sext i16 poison to i32
+ %r18 = zext i16 poison to i32
+ %r19 = sext i16 poison to i64
+ %r20 = zext i16 poison to i64
+ %r24 = sext i32 poison to i64
+ %r25 = zext i32 poison to i64
- %s2i8i16 = sext <2 x i8> undef to <2 x i16>
- %z2i8i16 = zext <2 x i8> undef to <2 x i16>
- %s2i8i32 = sext <2 x i8> undef to <2 x i32>
- %z2i8i32 = zext <2 x i8> undef to <2 x i32>
- %s2i8i64 = sext <2 x i8> undef to <2 x i64>
- %z2i8i64 = zext <2 x i8> undef to <2 x i64>
- %s2i16i32 = sext <2 x i16> undef to <2 x i32>
- %z2i16i32 = zext <2 x i16> undef to <2 x i32>
- %s2i16i64 = sext <2 x i16> undef to <2 x i64>
- %z2i16i64 = zext <2 x i16> undef to <2 x i64>
- %s2i32i64 = sext <2 x i32> undef to <2 x i64>
- %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+ %s2i8i16 = sext <2 x i8> poison to <2 x i16>
+ %z2i8i16 = zext <2 x i8> poison to <2 x i16>
+ %s2i8i32 = sext <2 x i8> poison to <2 x i32>
+ %z2i8i32 = zext <2 x i8> poison to <2 x i32>
+ %s2i8i64 = sext <2 x i8> poison to <2 x i64>
+ %z2i8i64 = zext <2 x i8> poison to <2 x i64>
+ %s2i16i32 = sext <2 x i16> poison to <2 x i32>
+ %z2i16i32 = zext <2 x i16> poison to <2 x i32>
+ %s2i16i64 = sext <2 x i16> poison to <2 x i64>
+ %z2i16i64 = zext <2 x i16> poison to <2 x i64>
+ %s2i32i64 = sext <2 x i32> poison to <2 x i64>
+ %z2i32i64 = zext <2 x i32> poison to <2 x i64>
- %s4i8i16 = sext <4 x i8> undef to <4 x i16>
- %z4i8i16 = zext <4 x i8> undef to <4 x i16>
- %s4i8i32 = sext <4 x i8> undef to <4 x i32>
- %z4i8i32 = zext <4 x i8> undef to <4 x i32>
- %s4i8i64 = sext <4 x i8> undef to <4 x i64>
- %z4i8i64 = zext <4 x i8> undef to <4 x i64>
- %s4i16i32 = sext <4 x i16> undef to <4 x i32>
- %z4i16i32 = zext <4 x i16> undef to <4 x i32>
- %s4i16i64 = sext <4 x i16> undef to <4 x i64>
- %z4i16i64 = zext <4 x i16> undef to <4 x i64>
- %s4i32i64 = sext <4 x i32> undef to <4 x i64>
- %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+ %s4i8i16 = sext <4 x i8> poison to <4 x i16>
+ %z4i8i16 = zext <4 x i8> poison to <4 x i16>
+ %s4i8i32 = sext <4 x i8> poison to <4 x i32>
+ %z4i8i32 = zext <4 x i8> poison to <4 x i32>
+ %s4i8i64 = sext <4 x i8> poison to <4 x i64>
+ %z4i8i64 = zext <4 x i8> poison to <4 x i64>
+ %s4i16i32 = sext <4 x i16> poison to <4 x i32>
+ %z4i16i32 = zext <4 x i16> poison to <4 x i32>
+ %s4i16i64 = sext <4 x i16> poison to <4 x i64>
+ %z4i16i64 = zext <4 x i16> poison to <4 x i64>
+ %s4i32i64 = sext <4 x i32> poison to <4 x i64>
+ %z4i32i64 = zext <4 x i32> poison to <4 x i64>
- %s8i8i16 = sext <8 x i8> undef to <8 x i16>
- %z8i8i16 = zext <8 x i8> undef to <8 x i16>
- %s8i8i32 = sext <8 x i8> undef to <8 x i32>
- %z8i8i32 = zext <8 x i8> undef to <8 x i32>
- %s8i8i64 = sext <8 x i8> undef to <8 x i64>
- %z8i8i64 = zext <8 x i8> undef to <8 x i64>
- %s8i16i32 = sext <8 x i16> undef to <8 x i32>
- %z8i16i32 = zext <8 x i16> undef to <8 x i32>
- %s8i16i64 = sext <8 x i16> undef to <8 x i64>
- %z8i16i64 = zext <8 x i16> undef to <8 x i64>
- %s8i32i64 = sext <8 x i32> undef to <8 x i64>
- %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+ %s8i8i16 = sext <8 x i8> poison to <8 x i16>
+ %z8i8i16 = zext <8 x i8> poison to <8 x i16>
+ %s8i8i32 = sext <8 x i8> poison to <8 x i32>
+ %z8i8i32 = zext <8 x i8> poison to <8 x i32>
+ %s8i8i64 = sext <8 x i8> poison to <8 x i64>
+ %z8i8i64 = zext <8 x i8> poison to <8 x i64>
+ %s8i16i32 = sext <8 x i16> poison to <8 x i32>
+ %z8i16i32 = zext <8 x i16> poison to <8 x i32>
+ %s8i16i64 = sext <8 x i16> poison to <8 x i64>
+ %z8i16i64 = zext <8 x i16> poison to <8 x i64>
+ %s8i32i64 = sext <8 x i32> poison to <8 x i64>
+ %z8i32i64 = zext <8 x i32> poison to <8 x i64>
- %s16i8i16 = sext <16 x i8> undef to <16 x i16>
- %z16i8i16 = zext <16 x i8> undef to <16 x i16>
- %s16i8i32 = sext <16 x i8> undef to <16 x i32>
- %z16i8i32 = zext <16 x i8> undef to <16 x i32>
- %s16i8i64 = sext <16 x i8> undef to <16 x i64>
- %z16i8i64 = zext <16 x i8> undef to <16 x i64>
- %s16i16i32 = sext <16 x i16> undef to <16 x i32>
- %z16i16i32 = zext <16 x i16> undef to <16 x i32>
- %s16i16i64 = sext <16 x i16> undef to <16 x i64>
- %z16i16i64 = zext <16 x i16> undef to <16 x i64>
- %s16i32i64 = sext <16 x i32> undef to <16 x i64>
- %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+ %s16i8i16 = sext <16 x i8> poison to <16 x i16>
+ %z16i8i16 = zext <16 x i8> poison to <16 x i16>
+ %s16i8i32 = sext <16 x i8> poison to <16 x i32>
+ %z16i8i32 = zext <16 x i8> poison to <16 x i32>
+ %s16i8i64 = sext <16 x i8> poison to <16 x i64>
+ %z16i8i64 = zext <16 x i8> poison to <16 x i64>
+ %s16i16i32 = sext <16 x i16> poison to <16 x i32>
+ %z16i16i32 = zext <16 x i16> poison to <16 x i32>
+ %s16i16i64 = sext <16 x i16> poison to <16 x i64>
+ %z16i16i64 = zext <16 x i16> poison to <16 x i64>
+ %s16i32i64 = sext <16 x i32> poison to <16 x i64>
+ %z16i32i64 = zext <16 x i32> poison to <16 x i64>
ret void
}
define void @trunc() {
; CHECK-SVE-LABEL: 'trunc'
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SVE128-NO-NEON-LABEL: 'trunc'
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-256-LABEL: 'trunc'
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-2048-LABEL: 'trunc'
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r8 = trunc i8 undef to i1
- %r15 = trunc i16 undef to i1
- %r16 = trunc i16 undef to i8
- %r21 = trunc i32 undef to i1
- %r22 = trunc i32 undef to i8
- %r23 = trunc i32 undef to i16
- %r26 = trunc i64 undef to i1
- %r27 = trunc i64 undef to i8
- %r28 = trunc i64 undef to i16
- %r29 = trunc i64 undef to i32
+ %r8 = trunc i8 poison to i1
+ %r15 = trunc i16 poison to i1
+ %r16 = trunc i16 poison to i8
+ %r21 = trunc i32 poison to i1
+ %r22 = trunc i32 poison to i8
+ %r23 = trunc i32 poison to i16
+ %r26 = trunc i64 poison to i1
+ %r27 = trunc i64 poison to i8
+ %r28 = trunc i64 poison to i16
+ %r29 = trunc i64 poison to i32
- %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
- %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
- %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
- %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
- %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
- %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
+ %s2i8i16 = trunc <2 x i16> poison to <2 x i8>
+ %s2i8i32 = trunc <2 x i32> poison to <2 x i8>
+ %s2i8i64 = trunc <2 x i64> poison to <2 x i8>
+ %s2i16i32 = trunc <2 x i32> poison to <2 x i16>
+ %s2i16i64 = trunc <2 x i64> poison to <2 x i16>
+ %s2i32i64 = trunc <2 x i64> poison to <2 x i32>
- %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
- %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
- %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
- %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
- %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
- %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
+ %s4i8i16 = trunc <4 x i16> poison to <4 x i8>
+ %s4i8i32 = trunc <4 x i32> poison to <4 x i8>
+ %s4i8i64 = trunc <4 x i64> poison to <4 x i8>
+ %s4i16i32 = trunc <4 x i32> poison to <4 x i16>
+ %s4i16i64 = trunc <4 x i64> poison to <4 x i16>
+ %s4i32i64 = trunc <4 x i64> poison to <4 x i32>
- %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
- %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
- %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
- %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
- %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
- %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
+ %s8i8i16 = trunc <8 x i16> poison to <8 x i8>
+ %s8i8i32 = trunc <8 x i32> poison to <8 x i8>
+ %s8i8i64 = trunc <8 x i64> poison to <8 x i8>
+ %s8i16i32 = trunc <8 x i32> poison to <8 x i16>
+ %s8i16i64 = trunc <8 x i64> poison to <8 x i16>
+ %s8i32i64 = trunc <8 x i64> poison to <8 x i32>
- %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
- %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
- %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
- %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
- %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
- %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+ %s16i8i16 = trunc <16 x i16> poison to <16 x i8>
+ %s16i8i32 = trunc <16 x i32> poison to <16 x i8>
+ %s16i8i64 = trunc <16 x i64> poison to <16 x i8>
+ %s16i16i32 = trunc <16 x i32> poison to <16 x i16>
+ %s16i16i64 = trunc <16 x i64> poison to <16 x i16>
+ %s16i32i64 = trunc <16 x i64> poison to <16 x i32>
ret void
}
define i32 @casts_no_users() {
; CHECK-SVE-LABEL: 'casts_no_users'
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double poison to float
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float poison to double
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> poison to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> poison to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> poison to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> poison to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> poison to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> poison to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> poison to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> poison to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> poison to <16 x double>
; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
; SVE128-NO-NEON-LABEL: 'casts_no_users'
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double poison to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float poison to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x float> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x float> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r100 = fptoui <2 x double> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r101 = fptosi <2 x double> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r102 = fptoui <2 x double> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r103 = fptosi <2 x double> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r104 = fptoui <2 x double> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r105 = fptosi <2 x double> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r106 = fptoui <2 x double> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r107 = fptosi <2 x double> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x float> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x float> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x float> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x float> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x float> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x float> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r174 = uitofp <2 x i16> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r175 = sitofp <2 x i16> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r178 = uitofp <2 x i64> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r179 = sitofp <2 x i64> poison to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r184 = uitofp <2 x i16> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r185 = sitofp <2 x i16> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r186 = uitofp <2 x i32> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r187 = sitofp <2 x i32> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> poison to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r194 = uitofp <4 x i16> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r195 = sitofp <4 x i16> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> poison to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> poison to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> poison to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> poison to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> poison to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> poison to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> poison to <16 x double>
; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
; FIXED-MIN-256-LABEL: 'casts_no_users'
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double poison to float
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float poison to double
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x float> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> poison to <2 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> poison to <2 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> poison to <4 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> poison to <4 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> poison to <8 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> poison to <8 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> poison to <16 x float>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> poison to <16 x double>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> poison to <16 x double>
; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
; FIXED-MIN-2048-LABEL: 'casts_no_users'
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r88 = fpext <8 x float> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r89 = fpext <16 x float> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double poison to float
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r83 = fptrunc <8 x double> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r84 = fptrunc <16 x double> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float poison to double
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r88 = fpext <8 x float> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r89 = fpext <16 x float> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x float> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x float> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x float> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r140 = fptoui <8 x double> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r141 = fptosi <8 x double> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r142 = fptoui <8 x double> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r143 = fptosi <8 x double> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r144 = fptoui <8 x double> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r145 = fptosi <8 x double> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r146 = fptoui <8 x double> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r147 = fptosi <8 x double> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r148 = fptoui <8 x double> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r149 = fptosi <8 x double> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x float> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x float> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x float> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x float> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x float> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x float> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x float> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x float> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x float> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x float> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r160 = fptoui <16 x double> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r161 = fptosi <16 x double> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r162 = fptoui <16 x double> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r163 = fptosi <16 x double> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r164 = fptoui <16 x double> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r165 = fptosi <16 x double> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r166 = fptoui <16 x double> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r167 = fptosi <16 x double> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r168 = fptoui <16 x double> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r169 = fptosi <16 x double> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> poison to <2 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> poison to <2 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> poison to <4 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> poison to <4 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r218 = uitofp <8 x i64> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r219 = sitofp <8 x i64> poison to <8 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r220 = uitofp <8 x i1> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r221 = sitofp <8 x i1> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r224 = uitofp <8 x i16> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r225 = sitofp <8 x i16> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r226 = uitofp <8 x i32> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r227 = sitofp <8 x i32> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r228 = uitofp <8 x i64> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r229 = sitofp <8 x i64> poison to <8 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r230 = uitofp <16 x i1> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r231 = sitofp <16 x i1> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r234 = uitofp <16 x i16> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r235 = sitofp <16 x i16> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r236 = uitofp <16 x i32> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r237 = sitofp <16 x i32> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r238 = uitofp <16 x i64> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r239 = sitofp <16 x i64> poison to <16 x float>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r240 = uitofp <16 x i1> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r241 = sitofp <16 x i1> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r244 = uitofp <16 x i16> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r245 = sitofp <16 x i16> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r246 = uitofp <16 x i32> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r247 = sitofp <16 x i32> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r248 = uitofp <16 x i64> poison to <16 x double>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r249 = sitofp <16 x i64> poison to <16 x double>
; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %r30 = fptoui float undef to i1
- %r31 = fptosi float undef to i1
- %r32 = fptoui float undef to i8
- %r33 = fptosi float undef to i8
- %r34 = fptoui float undef to i16
- %r35 = fptosi float undef to i16
- %r36 = fptoui float undef to i32
- %r37 = fptosi float undef to i32
- %r38 = fptoui float undef to i64
- %r39 = fptosi float undef to i64
- %r40 = fptoui double undef to i1
- %r41 = fptosi double undef to i1
- %r42 = fptoui double undef to i8
- %r43 = fptosi double undef to i8
- %r44 = fptoui double undef to i16
- %r45 = fptosi double undef to i16
- %r46 = fptoui double undef to i32
- %r47 = fptosi double undef to i32
- %r48 = fptoui double undef to i64
- %r49 = fptosi double undef to i64
- %r50 = sitofp i1 undef to float
- %r51 = uitofp i1 undef to float
- %r52 = sitofp i1 undef to double
- %r53 = uitofp i1 undef to double
- %r54 = sitofp i8 undef to float
- %r55 = uitofp i8 undef to float
- %r56 = sitofp i8 undef to double
- %r57 = uitofp i8 undef to double
- %r58 = sitofp i16 undef to float
- %r59 = uitofp i16 undef to float
- %r60 = sitofp i16 undef to double
- %r61 = uitofp i16 undef to double
- %r62 = sitofp i32 undef to float
- %r63 = uitofp i32 undef to float
- %r64 = sitofp i32 undef to double
- %r65 = uitofp i32 undef to double
- %r66 = sitofp i64 undef to float
- %r67 = uitofp i64 undef to float
- %r68 = sitofp i64 undef to double
- %r69 = uitofp i64 undef to double
- %r80 = fptrunc double undef to float
- %r81 = fptrunc <2 x double> undef to <2 x float>
- %r82 = fptrunc <4 x double> undef to <4 x float>
- %r83 = fptrunc <8 x double> undef to <8 x float>
- %r84 = fptrunc <16 x double> undef to <16 x float>
- %r85 = fpext float undef to double
- %r86 = fpext <2 x float> undef to <2 x double>
- %r87 = fpext <4 x float> undef to <4 x double>
- %r88 = fpext <8 x float> undef to <8 x double>
- %r89 = fpext <16 x float> undef to <16 x double>
- %r90 = fptoui <2 x float> undef to <2 x i1>
- %r91 = fptosi <2 x float> undef to <2 x i1>
- %r92 = fptoui <2 x float> undef to <2 x i8>
- %r93 = fptosi <2 x float> undef to <2 x i8>
- %r94 = fptoui <2 x float> undef to <2 x i16>
- %r95 = fptosi <2 x float> undef to <2 x i16>
- %r96 = fptoui <2 x float> undef to <2 x i32>
- %r97 = fptosi <2 x float> undef to <2 x i32>
- %r98 = fptoui <2 x float> undef to <2 x i64>
- %r99 = fptosi <2 x float> undef to <2 x i64>
- %r100 = fptoui <2 x double> undef to <2 x i1>
- %r101 = fptosi <2 x double> undef to <2 x i1>
- %r102 = fptoui <2 x double> undef to <2 x i8>
- %r103 = fptosi <2 x double> undef to <2 x i8>
- %r104 = fptoui <2 x double> undef to <2 x i16>
- %r105 = fptosi <2 x double> undef to <2 x i16>
- %r106 = fptoui <2 x double> undef to <2 x i32>
- %r107 = fptosi <2 x double> undef to <2 x i32>
- %r108 = fptoui <2 x double> undef to <2 x i64>
- %r109 = fptosi <2 x double> undef to <2 x i64>
+ %r30 = fptoui float poison to i1
+ %r31 = fptosi float poison to i1
+ %r32 = fptoui float poison to i8
+ %r33 = fptosi float poison to i8
+ %r34 = fptoui float poison to i16
+ %r35 = fptosi float poison to i16
+ %r36 = fptoui float poison to i32
+ %r37 = fptosi float poison to i32
+ %r38 = fptoui float poison to i64
+ %r39 = fptosi float poison to i64
+ %r40 = fptoui double poison to i1
+ %r41 = fptosi double poison to i1
+ %r42 = fptoui double poison to i8
+ %r43 = fptosi double poison to i8
+ %r44 = fptoui double poison to i16
+ %r45 = fptosi double poison to i16
+ %r46 = fptoui double poison to i32
+ %r47 = fptosi double poison to i32
+ %r48 = fptoui double poison to i64
+ %r49 = fptosi double poison to i64
+ %r50 = sitofp i1 poison to float
+ %r51 = uitofp i1 poison to float
+ %r52 = sitofp i1 poison to double
+ %r53 = uitofp i1 poison to double
+ %r54 = sitofp i8 poison to float
+ %r55 = uitofp i8 poison to float
+ %r56 = sitofp i8 poison to double
+ %r57 = uitofp i8 poison to double
+ %r58 = sitofp i16 poison to float
+ %r59 = uitofp i16 poison to float
+ %r60 = sitofp i16 poison to double
+ %r61 = uitofp i16 poison to double
+ %r62 = sitofp i32 poison to float
+ %r63 = uitofp i32 poison to float
+ %r64 = sitofp i32 poison to double
+ %r65 = uitofp i32 poison to double
+ %r66 = sitofp i64 poison to float
+ %r67 = uitofp i64 poison to float
+ %r68 = sitofp i64 poison to double
+ %r69 = uitofp i64 poison to double
+ %r80 = fptrunc double poison to float
+ %r81 = fptrunc <2 x double> poison to <2 x float>
+ %r82 = fptrunc <4 x double> poison to <4 x float>
+ %r83 = fptrunc <8 x double> poison to <8 x float>
+ %r84 = fptrunc <16 x double> poison to <16 x float>
+ %r85 = fpext float poison to double
+ %r86 = fpext <2 x float> poison to <2 x double>
+ %r87 = fpext <4 x float> poison to <4 x double>
+ %r88 = fpext <8 x float> poison to <8 x double>
+ %r89 = fpext <16 x float> poison to <16 x double>
+ %r90 = fptoui <2 x float> poison to <2 x i1>
+ %r91 = fptosi <2 x float> poison to <2 x i1>
+ %r92 = fptoui <2 x float> poison to <2 x i8>
+ %r93 = fptosi <2 x float> poison to <2 x i8>
+ %r94 = fptoui <2 x float> poison to <2 x i16>
+ %r95 = fptosi <2 x float> poison to <2 x i16>
+ %r96 = fptoui <2 x float> poison to <2 x i32>
+ %r97 = fptosi <2 x float> poison to <2 x i32>
+ %r98 = fptoui <2 x float> poison to <2 x i64>
+ %r99 = fptosi <2 x float> poison to <2 x i64>
+ %r100 = fptoui <2 x double> poison to <2 x i1>
+ %r101 = fptosi <2 x double> poison to <2 x i1>
+ %r102 = fptoui <2 x double> poison to <2 x i8>
+ %r103 = fptosi <2 x double> poison to <2 x i8>
+ %r104 = fptoui <2 x double> poison to <2 x i16>
+ %r105 = fptosi <2 x double> poison to <2 x i16>
+ %r106 = fptoui <2 x double> poison to <2 x i32>
+ %r107 = fptosi <2 x double> poison to <2 x i32>
+ %r108 = fptoui <2 x double> poison to <2 x i64>
+ %r109 = fptosi <2 x double> poison to <2 x i64>
- %r110 = fptoui <4 x float> undef to <4 x i1>
- %r111 = fptosi <4 x float> undef to <4 x i1>
- %r112 = fptoui <4 x float> undef to <4 x i8>
- %r113 = fptosi <4 x float> undef to <4 x i8>
- %r114 = fptoui <4 x float> undef to <4 x i16>
- %r115 = fptosi <4 x float> undef to <4 x i16>
- %r116 = fptoui <4 x float> undef to <4 x i32>
- %r117 = fptosi <4 x float> undef to <4 x i32>
- %r118 = fptoui <4 x float> undef to <4 x i64>
- %r119 = fptosi <4 x float> undef to <4 x i64>
+ %r110 = fptoui <4 x float> poison to <4 x i1>
+ %r111 = fptosi <4 x float> poison to <4 x i1>
+ %r112 = fptoui <4 x float> poison to <4 x i8>
+ %r113 = fptosi <4 x float> poison to <4 x i8>
+ %r114 = fptoui <4 x float> poison to <4 x i16>
+ %r115 = fptosi <4 x float> poison to <4 x i16>
+ %r116 = fptoui <4 x float> poison to <4 x i32>
+ %r117 = fptosi <4 x float> poison to <4 x i32>
+ %r118 = fptoui <4 x float> poison to <4 x i64>
+ %r119 = fptosi <4 x float> poison to <4 x i64>
- %r120 = fptoui <4 x double> undef to <4 x i1>
- %r121 = fptosi <4 x double> undef to <4 x i1>
- %r122 = fptoui <4 x double> undef to <4 x i8>
- %r123 = fptosi <4 x double> undef to <4 x i8>
- %r124 = fptoui <4 x double> undef to <4 x i16>
- %r125 = fptosi <4 x double> undef to <4 x i16>
- %r126 = fptoui <4 x double> undef to <4 x i32>
- %r127 = fptosi <4 x double> undef to <4 x i32>
- %r128 = fptoui <4 x double> undef to <4 x i64>
- %r129 = fptosi <4 x double> undef to <4 x i64>
+ %r120 = fptoui <4 x double> poison to <4 x i1>
+ %r121 = fptosi <4 x double> poison to <4 x i1>
+ %r122 = fptoui <4 x double> poison to <4 x i8>
+ %r123 = fptosi <4 x double> poison to <4 x i8>
+ %r124 = fptoui <4 x double> poison to <4 x i16>
+ %r125 = fptosi <4 x double> poison to <4 x i16>
+ %r126 = fptoui <4 x double> poison to <4 x i32>
+ %r127 = fptosi <4 x double> poison to <4 x i32>
+ %r128 = fptoui <4 x double> poison to <4 x i64>
+ %r129 = fptosi <4 x double> poison to <4 x i64>
- %r130 = fptoui <8 x float> undef to <8 x i1>
- %r131 = fptosi <8 x float> undef to <8 x i1>
- %r132 = fptoui <8 x float> undef to <8 x i8>
- %r133 = fptosi <8 x float> undef to <8 x i8>
- %r134 = fptoui <8 x float> undef to <8 x i16>
- %r135 = fptosi <8 x float> undef to <8 x i16>
- %r136 = fptoui <8 x float> undef to <8 x i32>
- %r137 = fptosi <8 x float> undef to <8 x i32>
- %r138 = fptoui <8 x float> undef to <8 x i64>
- %r139 = fptosi <8 x float> undef to <8 x i64>
+ %r130 = fptoui <8 x float> poison to <8 x i1>
+ %r131 = fptosi <8 x float> poison to <8 x i1>
+ %r132 = fptoui <8 x float> poison to <8 x i8>
+ %r133 = fptosi <8 x float> poison to <8 x i8>
+ %r134 = fptoui <8 x float> poison to <8 x i16>
+ %r135 = fptosi <8 x float> poison to <8 x i16>
+ %r136 = fptoui <8 x float> poison to <8 x i32>
+ %r137 = fptosi <8 x float> poison to <8 x i32>
+ %r138 = fptoui <8 x float> poison to <8 x i64>
+ %r139 = fptosi <8 x float> poison to <8 x i64>
- %r140 = fptoui <8 x double> undef to <8 x i1>
- %r141 = fptosi <8 x double> undef to <8 x i1>
- %r142 = fptoui <8 x double> undef to <8 x i8>
- %r143 = fptosi <8 x double> undef to <8 x i8>
- %r144 = fptoui <8 x double> undef to <8 x i16>
- %r145 = fptosi <8 x double> undef to <8 x i16>
- %r146 = fptoui <8 x double> undef to <8 x i32>
- %r147 = fptosi <8 x double> undef to <8 x i32>
- %r148 = fptoui <8 x double> undef to <8 x i64>
- %r149 = fptosi <8 x double> undef to <8 x i64>
+ %r140 = fptoui <8 x double> poison to <8 x i1>
+ %r141 = fptosi <8 x double> poison to <8 x i1>
+ %r142 = fptoui <8 x double> poison to <8 x i8>
+ %r143 = fptosi <8 x double> poison to <8 x i8>
+ %r144 = fptoui <8 x double> poison to <8 x i16>
+ %r145 = fptosi <8 x double> poison to <8 x i16>
+ %r146 = fptoui <8 x double> poison to <8 x i32>
+ %r147 = fptosi <8 x double> poison to <8 x i32>
+ %r148 = fptoui <8 x double> poison to <8 x i64>
+ %r149 = fptosi <8 x double> poison to <8 x i64>
- %r150 = fptoui <16 x float> undef to <16 x i1>
- %r151 = fptosi <16 x float> undef to <16 x i1>
- %r152 = fptoui <16 x float> undef to <16 x i8>
- %r153 = fptosi <16 x float> undef to <16 x i8>
- %r154 = fptoui <16 x float> undef to <16 x i16>
- %r155 = fptosi <16 x float> undef to <16 x i16>
- %r156 = fptoui <16 x float> undef to <16 x i32>
- %r157 = fptosi <16 x float> undef to <16 x i32>
- %r158 = fptoui <16 x float> undef to <16 x i64>
- %r159 = fptosi <16 x float> undef to <16 x i64>
+ %r150 = fptoui <16 x float> poison to <16 x i1>
+ %r151 = fptosi <16 x float> poison to <16 x i1>
+ %r152 = fptoui <16 x float> poison to <16 x i8>
+ %r153 = fptosi <16 x float> poison to <16 x i8>
+ %r154 = fptoui <16 x float> poison to <16 x i16>
+ %r155 = fptosi <16 x float> poison to <16 x i16>
+ %r156 = fptoui <16 x float> poison to <16 x i32>
+ %r157 = fptosi <16 x float> poison to <16 x i32>
+ %r158 = fptoui <16 x float> poison to <16 x i64>
+ %r159 = fptosi <16 x float> poison to <16 x i64>
- %r160 = fptoui <16 x double> undef to <16 x i1>
- %r161 = fptosi <16 x double> undef to <16 x i1>
- %r162 = fptoui <16 x double> undef to <16 x i8>
- %r163 = fptosi <16 x double> undef to <16 x i8>
- %r164 = fptoui <16 x double> undef to <16 x i16>
- %r165 = fptosi <16 x double> undef to <16 x i16>
- %r166 = fptoui <16 x double> undef to <16 x i32>
- %r167 = fptosi <16 x double> undef to <16 x i32>
- %r168 = fptoui <16 x double> undef to <16 x i64>
- %r169 = fptosi <16 x double> undef to <16 x i64>
+ %r160 = fptoui <16 x double> poison to <16 x i1>
+ %r161 = fptosi <16 x double> poison to <16 x i1>
+ %r162 = fptoui <16 x double> poison to <16 x i8>
+ %r163 = fptosi <16 x double> poison to <16 x i8>
+ %r164 = fptoui <16 x double> poison to <16 x i16>
+ %r165 = fptosi <16 x double> poison to <16 x i16>
+ %r166 = fptoui <16 x double> poison to <16 x i32>
+ %r167 = fptosi <16 x double> poison to <16 x i32>
+ %r168 = fptoui <16 x double> poison to <16 x i64>
+ %r169 = fptosi <16 x double> poison to <16 x i64>
- %r170 = uitofp <2 x i1> undef to <2 x float>
- %r171 = sitofp <2 x i1> undef to <2 x float>
- %r172 = uitofp <2 x i8> undef to <2 x float>
- %r173 = sitofp <2 x i8> undef to <2 x float>
- %r174 = uitofp <2 x i16> undef to <2 x float>
- %r175 = sitofp <2 x i16> undef to <2 x float>
- %r176 = uitofp <2 x i32> undef to <2 x float>
- %r177 = sitofp <2 x i32> undef to <2 x float>
- %r178 = uitofp <2 x i64> undef to <2 x float>
- %r179 = sitofp <2 x i64> undef to <2 x float>
+ %r170 = uitofp <2 x i1> poison to <2 x float>
+ %r171 = sitofp <2 x i1> poison to <2 x float>
+ %r172 = uitofp <2 x i8> poison to <2 x float>
+ %r173 = sitofp <2 x i8> poison to <2 x float>
+ %r174 = uitofp <2 x i16> poison to <2 x float>
+ %r175 = sitofp <2 x i16> poison to <2 x float>
+ %r176 = uitofp <2 x i32> poison to <2 x float>
+ %r177 = sitofp <2 x i32> poison to <2 x float>
+ %r178 = uitofp <2 x i64> poison to <2 x float>
+ %r179 = sitofp <2 x i64> poison to <2 x float>
- %r180 = uitofp <2 x i1> undef to <2 x double>
- %r181 = sitofp <2 x i1> undef to <2 x double>
- %r182 = uitofp <2 x i8> undef to <2 x double>
- %r183 = sitofp <2 x i8> undef to <2 x double>
- %r184 = uitofp <2 x i16> undef to <2 x double>
- %r185 = sitofp <2 x i16> undef to <2 x double>
- %r186 = uitofp <2 x i32> undef to <2 x double>
- %r187 = sitofp <2 x i32> undef to <2 x double>
- %r188 = uitofp <2 x i64> undef to <2 x double>
- %r189 = sitofp <2 x i64> undef to <2 x double>
+ %r180 = uitofp <2 x i1> poison to <2 x double>
+ %r181 = sitofp <2 x i1> poison to <2 x double>
+ %r182 = uitofp <2 x i8> poison to <2 x double>
+ %r183 = sitofp <2 x i8> poison to <2 x double>
+ %r184 = uitofp <2 x i16> poison to <2 x double>
+ %r185 = sitofp <2 x i16> poison to <2 x double>
+ %r186 = uitofp <2 x i32> poison to <2 x double>
+ %r187 = sitofp <2 x i32> poison to <2 x double>
+ %r188 = uitofp <2 x i64> poison to <2 x double>
+ %r189 = sitofp <2 x i64> poison to <2 x double>
- %r190 = uitofp <4 x i1> undef to <4 x float>
- %r191 = sitofp <4 x i1> undef to <4 x float>
- %r192 = uitofp <4 x i8> undef to <4 x float>
- %r193 = sitofp <4 x i8> undef to <4 x float>
- %r194 = uitofp <4 x i16> undef to <4 x float>
- %r195 = sitofp <4 x i16> undef to <4 x float>
- %r196 = uitofp <4 x i32> undef to <4 x float>
- %r197 = sitofp <4 x i32> undef to <4 x float>
- %r198 = uitofp <4 x i64> undef to <4 x float>
- %r199 = sitofp <4 x i64> undef to <4 x float>
+ %r190 = uitofp <4 x i1> poison to <4 x float>
+ %r191 = sitofp <4 x i1> poison to <4 x float>
+ %r192 = uitofp <4 x i8> poison to <4 x float>
+ %r193 = sitofp <4 x i8> poison to <4 x float>
+ %r194 = uitofp <4 x i16> poison to <4 x float>
+ %r195 = sitofp <4 x i16> poison to <4 x float>
+ %r196 = uitofp <4 x i32> poison to <4 x float>
+ %r197 = sitofp <4 x i32> poison to <4 x float>
+ %r198 = uitofp <4 x i64> poison to <4 x float>
+ %r199 = sitofp <4 x i64> poison to <4 x float>
- %r200 = uitofp <4 x i1> undef to <4 x double>
- %r201 = sitofp <4 x i1> undef to <4 x double>
- %r202 = uitofp <4 x i8> undef to <4 x double>
- %r203 = sitofp <4 x i8> undef to <4 x double>
- %r204 = uitofp <4 x i16> undef to <4 x double>
- %r205 = sitofp <4 x i16> undef to <4 x double>
- %r206 = uitofp <4 x i32> undef to <4 x double>
- %r207 = sitofp <4 x i32> undef to <4 x double>
- %r208 = uitofp <4 x i64> undef to <4 x double>
- %r209 = sitofp <4 x i64> undef to <4 x double>
+ %r200 = uitofp <4 x i1> poison to <4 x double>
+ %r201 = sitofp <4 x i1> poison to <4 x double>
+ %r202 = uitofp <4 x i8> poison to <4 x double>
+ %r203 = sitofp <4 x i8> poison to <4 x double>
+ %r204 = uitofp <4 x i16> poison to <4 x double>
+ %r205 = sitofp <4 x i16> poison to <4 x double>
+ %r206 = uitofp <4 x i32> poison to <4 x double>
+ %r207 = sitofp <4 x i32> poison to <4 x double>
+ %r208 = uitofp <4 x i64> poison to <4 x double>
+ %r209 = sitofp <4 x i64> poison to <4 x double>
- %r210 = uitofp <8 x i1> undef to <8 x float>
- %r211 = sitofp <8 x i1> undef to <8 x float>
- %r212 = uitofp <8 x i8> undef to <8 x float>
- %r213 = sitofp <8 x i8> undef to <8 x float>
- %r214 = uitofp <8 x i16> undef to <8 x float>
- %r215 = sitofp <8 x i16> undef to <8 x float>
- %r216 = uitofp <8 x i32> undef to <8 x float>
- %r217 = sitofp <8 x i32> undef to <8 x float>
- %r218 = uitofp <8 x i64> undef to <8 x float>
- %r219 = sitofp <8 x i64> undef to <8 x float>
+ %r210 = uitofp <8 x i1> poison to <8 x float>
+ %r211 = sitofp <8 x i1> poison to <8 x float>
+ %r212 = uitofp <8 x i8> poison to <8 x float>
+ %r213 = sitofp <8 x i8> poison to <8 x float>
+ %r214 = uitofp <8 x i16> poison to <8 x float>
+ %r215 = sitofp <8 x i16> poison to <8 x float>
+ %r216 = uitofp <8 x i32> poison to <8 x float>
+ %r217 = sitofp <8 x i32> poison to <8 x float>
+ %r218 = uitofp <8 x i64> poison to <8 x float>
+ %r219 = sitofp <8 x i64> poison to <8 x float>
- %r220 = uitofp <8 x i1> undef to <8 x double>
- %r221 = sitofp <8 x i1> undef to <8 x double>
- %r222 = uitofp <8 x i8> undef to <8 x double>
- %r223 = sitofp <8 x i8> undef to <8 x double>
- %r224 = uitofp <8 x i16> undef to <8 x double>
- %r225 = sitofp <8 x i16> undef to <8 x double>
- %r226 = uitofp <8 x i32> undef to <8 x double>
- %r227 = sitofp <8 x i32> undef to <8 x double>
- %r228 = uitofp <8 x i64> undef to <8 x double>
- %r229 = sitofp <8 x i64> undef to <8 x double>
+ %r220 = uitofp <8 x i1> poison to <8 x double>
+ %r221 = sitofp <8 x i1> poison to <8 x double>
+ %r222 = uitofp <8 x i8> poison to <8 x double>
+ %r223 = sitofp <8 x i8> poison to <8 x double>
+ %r224 = uitofp <8 x i16> poison to <8 x double>
+ %r225 = sitofp <8 x i16> poison to <8 x double>
+ %r226 = uitofp <8 x i32> poison to <8 x double>
+ %r227 = sitofp <8 x i32> poison to <8 x double>
+ %r228 = uitofp <8 x i64> poison to <8 x double>
+ %r229 = sitofp <8 x i64> poison to <8 x double>
- %r230 = uitofp <16 x i1> undef to <16 x float>
- %r231 = sitofp <16 x i1> undef to <16 x float>
- %r232 = uitofp <16 x i8> undef to <16 x float>
- %r233 = sitofp <16 x i8> undef to <16 x float>
- %r234 = uitofp <16 x i16> undef to <16 x float>
- %r235 = sitofp <16 x i16> undef to <16 x float>
- %r236 = uitofp <16 x i32> undef to <16 x float>
- %r237 = sitofp <16 x i32> undef to <16 x float>
- %r238 = uitofp <16 x i64> undef to <16 x float>
- %r239 = sitofp <16 x i64> undef to <16 x float>
+ %r230 = uitofp <16 x i1> poison to <16 x float>
+ %r231 = sitofp <16 x i1> poison to <16 x float>
+ %r232 = uitofp <16 x i8> poison to <16 x float>
+ %r233 = sitofp <16 x i8> poison to <16 x float>
+ %r234 = uitofp <16 x i16> poison to <16 x float>
+ %r235 = sitofp <16 x i16> poison to <16 x float>
+ %r236 = uitofp <16 x i32> poison to <16 x float>
+ %r237 = sitofp <16 x i32> poison to <16 x float>
+ %r238 = uitofp <16 x i64> poison to <16 x float>
+ %r239 = sitofp <16 x i64> poison to <16 x float>
- %r240 = uitofp <16 x i1> undef to <16 x double>
- %r241 = sitofp <16 x i1> undef to <16 x double>
- %r242 = uitofp <16 x i8> undef to <16 x double>
- %r243 = sitofp <16 x i8> undef to <16 x double>
- %r244 = uitofp <16 x i16> undef to <16 x double>
- %r245 = sitofp <16 x i16> undef to <16 x double>
- %r246 = uitofp <16 x i32> undef to <16 x double>
- %r247 = sitofp <16 x i32> undef to <16 x double>
- %r248 = uitofp <16 x i64> undef to <16 x double>
- %r249 = sitofp <16 x i64> undef to <16 x double>
+ %r240 = uitofp <16 x i1> poison to <16 x double>
+ %r241 = sitofp <16 x i1> poison to <16 x double>
+ %r242 = uitofp <16 x i8> poison to <16 x double>
+ %r243 = sitofp <16 x i8> poison to <16 x double>
+ %r244 = uitofp <16 x i16> poison to <16 x double>
+ %r245 = sitofp <16 x i16> poison to <16 x double>
+ %r246 = uitofp <16 x i32> poison to <16 x double>
+ %r247 = sitofp <16 x i32> poison to <16 x double>
+ %r248 = uitofp <16 x i64> poison to <16 x double>
+ %r249 = sitofp <16 x i64> poison to <16 x double>
ret i32 undef
}
@@ -1760,24 +1760,24 @@ define i32 @casts_with_users(i8 %a, i16 %b, i32 %c, i64 %d, i1 %e) {
define i32 @bitcasts() {
; CHECK-LABEL: 'bitcasts'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %a = bitcast i32 undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %b = bitcast float undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = bitcast i32 undef to float
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = bitcast float undef to i32
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = bitcast i64 undef to double
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = bitcast double undef to i64
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = bitcast half undef to i16
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = bitcast i16 undef to half
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %a = bitcast i32 poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %b = bitcast float poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = bitcast i32 poison to float
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = bitcast float poison to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = bitcast i64 poison to double
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = bitcast double poison to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = bitcast half poison to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = bitcast i16 poison to half
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %a = bitcast i32 undef to i32
- %b = bitcast float undef to float
- %c = bitcast i32 undef to float
- %d = bitcast float undef to i32
- %e = bitcast i64 undef to double
- %f = bitcast double undef to i64
- %g = bitcast half undef to i16
- %h = bitcast i16 undef to half
+ %a = bitcast i32 poison to i32
+ %b = bitcast float poison to float
+ %c = bitcast i32 poison to float
+ %d = bitcast float poison to i32
+ %e = bitcast i64 poison to double
+ %f = bitcast double poison to i64
+ %g = bitcast half poison to i16
+ %h = bitcast i16 poison to half
ret i32 undef
}
@@ -2012,31 +2012,31 @@ define i32 @load_extends() #0 {
define i32 @store_truncs() {
; CHECK-LABEL: 'store_truncs'
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = trunc i64 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = trunc i64 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r0, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = trunc i64 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = trunc i64 poison to i16
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r1, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = trunc i64 undef to i32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = trunc i64 poison to i32
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r2, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = trunc i32 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = trunc i32 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r3, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = trunc i32 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = trunc i32 poison to i16
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r4, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = trunc i16 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = trunc i16 poison to i8
; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r5, ptr undef, align 1
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
;
- %r0 = trunc i64 undef to i8
+ %r0 = trunc i64 poison to i8
store i8 %r0, ptr undef
- %r1 = trunc i64 undef to i16
+ %r1 = trunc i64 poison to i16
store i16 %r1, ptr undef
- %r2 = trunc i64 undef to i32
+ %r2 = trunc i64 poison to i32
store i32 %r2, ptr undef
- %r3 = trunc i32 undef to i8
+ %r3 = trunc i32 poison to i8
store i8 %r3, ptr undef
- %r4 = trunc i32 undef to i16
+ %r4 = trunc i32 poison to i16
store i16 %r4, ptr undef
- %r5 = trunc i16 undef to i8
+ %r5 = trunc i16 poison to i8
store i8 %r5, ptr undef
ret i32 undef
}
@@ -2084,372 +2084,372 @@ declare void @use(i16, i16, i32, i32, i64, i64, i32, i32, i64, i64, i64, i64)
define void @fp16cast() {
; CHECK-SVE-LABEL: 'fp16cast'
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SVE128-NO-NEON-LABEL: 'fp16cast'
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-256-LABEL: 'fp16cast'
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; FIXED-MIN-2048-LABEL: 'fp16cast'
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
-; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half poison to i1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half poison to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half poison to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half poison to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half poison to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> poison to <2 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> poison to <2 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> poison to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> poison to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> poison to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> poison to <4 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> poison to <4 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> poison to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> poison to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> poison to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> poison to <8 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> poison to <8 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> poison to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> poison to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x half> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x half> poison to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> poison to <16 x i1>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> poison to <16 x i8>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> poison to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x half> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x half> poison to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x half> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x half> poison to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r258 = uitofp <8 x i64> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r259 = sitofp <8 x i64> poison to <8 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x i16> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r266 = uitofp <16 x i32> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r267 = sitofp <16 x i32> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r268 = uitofp <16 x i64> poison to <16 x half>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r269 = sitofp <16 x i64> poison to <16 x half>
; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %r30 = fptoui half undef to i1
- %r31 = fptosi half undef to i1
- %r32 = fptoui half undef to i8
- %r33 = fptosi half undef to i8
- %r34 = fptoui half undef to i16
- %r35 = fptosi half undef to i16
- %r36 = fptoui half undef to i32
- %r37 = fptosi half undef to i32
- %r38 = fptoui half undef to i64
- %r39 = fptosi half undef to i64
+ %r30 = fptoui half poison to i1
+ %r31 = fptosi half poison to i1
+ %r32 = fptoui half poison to i8
+ %r33 = fptosi half poison to i8
+ %r34 = fptoui half poison to i16
+ %r35 = fptosi half poison to i16
+ %r36 = fptoui half poison to i32
+ %r37 = fptosi half poison to i32
+ %r38 = fptoui half poison to i64
+ %r39 = fptosi half poison to i64
- %r90 = fptoui <2 x half> undef to <2 x i1>
- %r91 = fptosi <2 x half> undef to <2 x i1>
- %r92 = fptoui <2 x half> undef to <2 x i8>
- %r93 = fptosi <2 x half> undef to <2 x i8>
- %r94 = fptoui <2 x half> undef to <2 x i16>
- %r95 = fptosi <2 x half> undef to <2 x i16>
- %r96 = fptoui <2 x half> undef to <2 x i32>
- %r97 = fptosi <2 x half> undef to <2 x i32>
- %r98 = fptoui <2 x half> undef to <2 x i64>
- %r99 = fptosi <2 x half> undef to <2 x i64>
+ %r90 = fptoui <2 x half> poison to <2 x i1>
+ %r91 = fptosi <2 x half> poison to <2 x i1>
+ %r92 = fptoui <2 x half> poison to <2 x i8>
+ %r93 = fptosi <2 x half> poison to <2 x i8>
+ %r94 = fptoui <2 x half> poison to <2 x i16>
+ %r95 = fptosi <2 x half> poison to <2 x i16>
+ %r96 = fptoui <2 x half> poison to <2 x i32>
+ %r97 = fptosi <2 x half> poison to <2 x i32>
+ %r98 = fptoui <2 x half> poison to <2 x i64>
+ %r99 = fptosi <2 x half> poison to <2 x i64>
- %r110 = fptoui <4 x half> undef to <4 x i1>
- %r111 = fptosi <4 x half> undef to <4 x i1>
- %r112 = fptoui <4 x half> undef to <4 x i8>
- %r113 = fptosi <4 x half> undef to <4 x i8>
- %r114 = fptoui <4 x half> undef to <4 x i16>
- %r115 = fptosi <4 x half> undef to <4 x i16>
- %r116 = fptoui <4 x half> undef to <4 x i32>
- %r117 = fptosi <4 x half> undef to <4 x i32>
- %r118 = fptoui <4 x half> undef to <4 x i64>
- %r119 = fptosi <4 x half> undef to <4 x i64>
+ %r110 = fptoui <4 x half> poison to <4 x i1>
+ %r111 = fptosi <4 x half> poison to <4 x i1>
+ %r112 = fptoui <4 x half> poison to <4 x i8>
+ %r113 = fptosi <4 x half> poison to <4 x i8>
+ %r114 = fptoui <4 x half> poison to <4 x i16>
+ %r115 = fptosi <4 x half> poison to <4 x i16>
+ %r116 = fptoui <4 x half> poison to <4 x i32>
+ %r117 = fptosi <4 x half> poison to <4 x i32>
+ %r118 = fptoui <4 x half> poison to <4 x i64>
+ %r119 = fptosi <4 x half> poison to <4 x i64>
- %r130 = fptoui <8 x half> undef to <8 x i1>
- %r131 = fptosi <8 x half> undef to <8 x i1>
- %r132 = fptoui <8 x half> undef to <8 x i8>
- %r133 = fptosi <8 x half> undef to <8 x i8>
- %r134 = fptoui <8 x half> undef to <8 x i16>
- %r135 = fptosi <8 x half> undef to <8 x i16>
- %r136 = fptoui <8 x half> undef to <8 x i32>
- %r137 = fptosi <8 x half> undef to <8 x i32>
- %r138 = fptoui <8 x half> undef to <8 x i64>
- %r139 = fptosi <8 x half> undef to <8 x i64>
+ %r130 = fptoui <8 x half> poison to <8 x i1>
+ %r131 = fptosi <8 x half> poison to <8 x i1>
+ %r132 = fptoui <8 x half> poison to <8 x i8>
+ %r133 = fptosi <8 x half> poison to <8 x i8>
+ %r134 = fptoui <8 x half> poison to <8 x i16>
+ %r135 = fptosi <8 x half> poison to <8 x i16>
+ %r136 = fptoui <8 x half> poison to <8 x i32>
+ %r137 = fptosi <8 x half> poison to <8 x i32>
+ %r138 = fptoui <8 x half> poison to <8 x i64>
+ %r139 = fptosi <8 x half> poison to <8 x i64>
- %r150 = fptoui <16 x half> undef to <16 x i1>
- %r151 = fptosi <16 x half> undef to <16 x i1>
- %r152 = fptoui <16 x half> undef to <16 x i8>
- %r153 = fptosi <16 x half> undef to <16 x i8>
- %r154 = fptoui <16 x half> undef to <16 x i16>
- %r155 = fptosi <16 x half> undef to <16 x i16>
- %r156 = fptoui <16 x half> undef to <16 x i32>
- %r157 = fptosi <16 x half> undef to <16 x i32>
- %r158 = fptoui <16 x half> undef to <16 x i64>
- %r159 = fptosi <16 x half> undef to <16 x i64>
+ %r150 = fptoui <16 x half> poison to <16 x i1>
+ %r151 = fptosi <16 x half> poison to <16 x i1>
+ %r152 = fptoui <16 x half> poison to <16 x i8>
+ %r153 = fptosi <16 x half> poison to <16 x i8>
+ %r154 = fptoui <16 x half> poison to <16 x i16>
+ %r155 = fptosi <16 x half> poison to <16 x i16>
+ %r156 = fptoui <16 x half> poison to <16 x i32>
+ %r157 = fptosi <16 x half> poison to <16 x i32>
+ %r158 = fptoui <16 x half> poison to <16 x i64>
+ %r159 = fptosi <16 x half> poison to <16 x i64>
- %r250 = uitofp <8 x i1> undef to <8 x half>
- %r251 = sitofp <8 x i1> undef to <8 x half>
- %r252 = uitofp <8 x i8> undef to <8 x half>
- %r253 = sitofp <8 x i8> undef to <8 x half>
- %r254 = uitofp <8 x i16> undef to <8 x half>
- %r255 = sitofp <8 x i16> undef to <8 x half>
- %r256 = uitofp <8 x i32> undef to <8 x half>
- %r257 = sitofp <8 x i32> undef to <8 x half>
- %r258 = uitofp <8 x i64> undef to <8 x half>
- %r259 = sitofp <8 x i64> undef to <8 x half>
+ %r250 = uitofp <8 x i1> poison to <8 x half>
+ %r251 = sitofp <8 x i1> poison to <8 x half>
+ %r252 = uitofp <8 x i8> poison to <8 x half>
+ %r253 = sitofp <8 x i8> poison to <8 x half>
+ %r254 = uitofp <8 x i16> poison to <8 x half>
+ %r255 = sitofp <8 x i16> poison to <8 x half>
+ %r256 = uitofp <8 x i32> poison to <8 x half>
+ %r257 = sitofp <8 x i32> poison to <8 x half>
+ %r258 = uitofp <8 x i64> poison to <8 x half>
+ %r259 = sitofp <8 x i64> poison to <8 x half>
- %r260 = uitofp <16 x i1> undef to <16 x half>
- %r261 = sitofp <16 x i1> undef to <16 x half>
- %r262 = uitofp <16 x i8> undef to <16 x half>
- %r263 = sitofp <16 x i8> undef to <16 x half>
- %r264 = uitofp <16 x i16> undef to <16 x half>
- %r265 = sitofp <16 x i16> undef to <16 x half>
- %r266 = uitofp <16 x i32> undef to <16 x half>
- %r267 = sitofp <16 x i32> undef to <16 x half>
- %r268 = uitofp <16 x i64> undef to <16 x half>
- %r269 = sitofp <16 x i64> undef to <16 x half>
+ %r260 = uitofp <16 x i1> poison to <16 x half>
+ %r261 = sitofp <16 x i1> poison to <16 x half>
+ %r262 = uitofp <16 x i8> poison to <16 x half>
+ %r263 = sitofp <16 x i8> poison to <16 x half>
+ %r264 = uitofp <16 x i16> poison to <16 x half>
+ %r265 = sitofp <16 x i16> poison to <16 x half>
+ %r266 = uitofp <16 x i32> poison to <16 x half>
+ %r267 = sitofp <16 x i32> poison to <16 x half>
+ %r268 = uitofp <16 x i64> poison to <16 x half>
+ %r269 = sitofp <16 x i64> poison to <16 x half>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
index b887654..91aaea2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll
@@ -5,49 +5,49 @@ target triple = "aarch64-unknown-linux-gnu"
define void @sve_ext() {
; CHECK-LABEL: 'sve_ext'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i8_to_i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i8_to_i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i16 = zext <vscale x 16 x i8> poison to <vscale x 16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i32 = zext <vscale x 16 x i8> poison to <vscale x 16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i64 = zext <vscale x 16 x i8> poison to <vscale x 16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i8_to_i64 = zext <vscale x 4 x i8> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i32 = zext <vscale x 8 x i8> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i64 = zext <vscale x 8 x i8> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> poison to <vscale x 16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> poison to <vscale x 16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i64 = sext <vscale x 16 x i8> poison to <vscale x 16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i8_to_i64 = sext <vscale x 4 x i8> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i32 = sext <vscale x 8 x i8> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i64 = sext <vscale x 8 x i8> poison to <vscale x 8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %zext_nxv16_i8_to_i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
- %zext_nxv16_i8_to_i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
- %zext_nxv16_i8_to_i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
- %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
- %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
- %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
- %zext_nxv4_i8_to_i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
- %zext_nxv8_i8_to_i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
- %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
- %zext_nxv8_i8_to_i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+ %zext_nxv16_i8_to_i16 = zext <vscale x 16 x i8> poison to <vscale x 16 x i16>
+ %zext_nxv16_i8_to_i32 = zext <vscale x 16 x i8> poison to <vscale x 16 x i32>
+ %zext_nxv16_i8_to_i64 = zext <vscale x 16 x i8> poison to <vscale x 16 x i64>
+ %zext_nxv8_i16_to_i32 = zext <vscale x 8 x i16> poison to <vscale x 8 x i32>
+ %zext_nxv8_i16_to_i64 = zext <vscale x 8 x i16> poison to <vscale x 8 x i64>
+ %zext_nxv4_i32_to_i64 = zext <vscale x 4 x i32> poison to <vscale x 4 x i64>
+ %zext_nxv4_i8_to_i64 = zext <vscale x 4 x i8> poison to <vscale x 4 x i64>
+ %zext_nxv8_i8_to_i32 = zext <vscale x 8 x i8> poison to <vscale x 8 x i32>
+ %zext_nxv4_i16_to_i64 = zext <vscale x 4 x i16> poison to <vscale x 4 x i64>
+ %zext_nxv8_i8_to_i64 = zext <vscale x 8 x i8> poison to <vscale x 8 x i64>
- %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
- %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
- %sext_nxv16_i8_to_i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
- %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
- %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
- %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
- %sext_nxv4_i8_to_i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
- %sext_nxv8_i8_to_i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
- %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
- %sext_nxv8_i8_to_i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+ %sext_nxv16_i8_to_i16 = sext <vscale x 16 x i8> poison to <vscale x 16 x i16>
+ %sext_nxv16_i8_to_i32 = sext <vscale x 16 x i8> poison to <vscale x 16 x i32>
+ %sext_nxv16_i8_to_i64 = sext <vscale x 16 x i8> poison to <vscale x 16 x i64>
+ %sext_nxv8_i16_to_i32 = sext <vscale x 8 x i16> poison to <vscale x 8 x i32>
+ %sext_nxv8_i16_to_i64 = sext <vscale x 8 x i16> poison to <vscale x 8 x i64>
+ %sext_nxv4_i32_to_i64 = sext <vscale x 4 x i32> poison to <vscale x 4 x i64>
+ %sext_nxv4_i8_to_i64 = sext <vscale x 4 x i8> poison to <vscale x 4 x i64>
+ %sext_nxv8_i8_to_i32 = sext <vscale x 8 x i8> poison to <vscale x 8 x i32>
+ %sext_nxv4_i16_to_i64 = sext <vscale x 4 x i16> poison to <vscale x 4 x i64>
+ %sext_nxv8_i8_to_i64 = sext <vscale x 8 x i8> poison to <vscale x 8 x i64>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
index 4ad0e3f..1e698b1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
@@ -6,49 +6,49 @@ target triple = "aarch64-unknown-linux-gnu"
define void @sve_fpext() {
; CHECK-LABEL: 'sve_fpext'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_to_f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_to_f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_to_f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x half> poison to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x half> poison to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x half> poison to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x half> poison to <vscale x 2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x half> poison to <vscale x 4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x half> poison to <vscale x 8 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_to_f64 = fpext <vscale x 2 x float> poison to <vscale x 2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_to_f64 = fpext <vscale x 4 x float> poison to <vscale x 4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_to_f64 = fpext <vscale x 8 x float> poison to <vscale x 8 x double>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %nxv2_f16_to_f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
- %nxv4_f16_to_f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
- %nxv8_f16_to_f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
+ %nxv2_f16_to_f32 = fpext <vscale x 2 x half> poison to <vscale x 2 x float>
+ %nxv4_f16_to_f32 = fpext <vscale x 4 x half> poison to <vscale x 4 x float>
+ %nxv8_f16_to_f32 = fpext <vscale x 8 x half> poison to <vscale x 8 x float>
- %nxv2_f16_to_f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
- %nxv4_f16_to_f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
- %nxv8_f16_to_f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
+ %nxv2_f16_to_f64 = fpext <vscale x 2 x half> poison to <vscale x 2 x double>
+ %nxv4_f16_to_f64 = fpext <vscale x 4 x half> poison to <vscale x 4 x double>
+ %nxv8_f16_to_f64 = fpext <vscale x 8 x half> poison to <vscale x 8 x double>
- %nxv2_f32_to_f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>
- %nxv4_f32_to_f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>
- %nxv8_f32_to_f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>
+ %nxv2_f32_to_f64 = fpext <vscale x 2 x float> poison to <vscale x 2 x double>
+ %nxv4_f32_to_f64 = fpext <vscale x 4 x float> poison to <vscale x 4 x double>
+ %nxv8_f32_to_f64 = fpext <vscale x 8 x float> poison to <vscale x 8 x double>
ret void
}
define void @sve_fpext_bf16() {
; CHECK-LABEL: 'sve_fpext_bf16'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> poison to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> poison to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> poison to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> poison to <vscale x 2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> poison to <vscale x 4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> poison to <vscale x 8 x double>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
- %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
- %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
+ %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> poison to <vscale x 2 x float>
+ %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> poison to <vscale x 4 x float>
+ %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> poison to <vscale x 8 x float>
- %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
- %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
- %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
+ %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> poison to <vscale x 2 x double>
+ %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> poison to <vscale x 4 x double>
+ %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> poison to <vscale x 8 x double>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll
index 06ed58d..ce624a1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll
@@ -6,163 +6,163 @@ target triple = "aarch64-unknown-linux-gnu"
define void @sve-fptoi() {
; CHECK-LABEL: 'sve-fptoi'
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_si64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_ui64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_si64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_ui64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_si8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_ui8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si8 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui8 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si32 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui32 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si64 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui64 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si8 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui8 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si16 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui16 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si64 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui64 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si8 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui8 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si16 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui16 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si32 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui32 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si8 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui8 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si32 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui32 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si64 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui64 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si8 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui8 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si16 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui16 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si64 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui64 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si8 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui8 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si16 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui16 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si32 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui32 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si8 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui8 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si32 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui32 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_si64 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_ui64 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si8 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui8 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si16 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui16 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_si64 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_ui64 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si8 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui8 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si16 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui16 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si32 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui32 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_si8 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_ui8 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si32 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui32 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si64 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui64 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si8 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui8 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si16 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui16 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si64 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui64 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si8 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui8 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si16 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui16 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si32 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui32 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %nv1f16_to_si8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
- %nv1f16_to_ui8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
- %nv1f16_to_si32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
- %nv1f16_to_ui32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
- %nv1f16_to_si64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
- %nv1f16_to_ui64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+ %nv1f16_to_si8 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i8>
+ %nv1f16_to_ui8 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i8>
+ %nv1f16_to_si32 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i32>
+ %nv1f16_to_ui32 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i32>
+ %nv1f16_to_si64 = fptosi <vscale x 1 x half> poison to <vscale x 1 x i64>
+ %nv1f16_to_ui64 = fptoui <vscale x 1 x half> poison to <vscale x 1 x i64>
- %nv1f32_to_si8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
- %nv1f32_to_ui8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
- %nv1f32_to_si16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
- %nv1f32_to_ui16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
- %nv1f32_to_si64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
- %nv1f32_to_ui64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
+ %nv1f32_to_si8 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i8>
+ %nv1f32_to_ui8 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i8>
+ %nv1f32_to_si16 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i16>
+ %nv1f32_to_ui16 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i16>
+ %nv1f32_to_si64 = fptosi <vscale x 1 x float> poison to <vscale x 1 x i64>
+ %nv1f32_to_ui64 = fptoui <vscale x 1 x float> poison to <vscale x 1 x i64>
- %nv1f64_to_si8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
- %nv1f64_to_ui8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
- %nv1f64_to_si16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
- %nv1f64_to_ui16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
- %nv1f64_to_si32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
- %nv1f64_to_ui32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
+ %nv1f64_to_si8 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i8>
+ %nv1f64_to_ui8 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i8>
+ %nv1f64_to_si16 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i16>
+ %nv1f64_to_ui16 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i16>
+ %nv1f64_to_si32 = fptosi <vscale x 1 x double> poison to <vscale x 1 x i32>
+ %nv1f64_to_ui32 = fptoui <vscale x 1 x double> poison to <vscale x 1 x i32>
- %nv2f16_to_si8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
- %nv2f16_to_ui8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
- %nv2f16_to_si32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
- %nv2f16_to_ui32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
- %nv2f16_to_si64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
- %nv2f16_to_ui64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+ %nv2f16_to_si8 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i8>
+ %nv2f16_to_ui8 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i8>
+ %nv2f16_to_si32 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i32>
+ %nv2f16_to_ui32 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i32>
+ %nv2f16_to_si64 = fptosi <vscale x 2 x half> poison to <vscale x 2 x i64>
+ %nv2f16_to_ui64 = fptoui <vscale x 2 x half> poison to <vscale x 2 x i64>
- %nv2f32_to_si8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
- %nv2f32_to_ui8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
- %nv2f32_to_si16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
- %nv2f32_to_ui16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
- %nv2f32_to_si64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
- %nv2f32_to_ui64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
+ %nv2f32_to_si8 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i8>
+ %nv2f32_to_ui8 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i8>
+ %nv2f32_to_si16 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i16>
+ %nv2f32_to_ui16 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i16>
+ %nv2f32_to_si64 = fptosi <vscale x 2 x float> poison to <vscale x 2 x i64>
+ %nv2f32_to_ui64 = fptoui <vscale x 2 x float> poison to <vscale x 2 x i64>
- %nv2f64_to_si8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
- %nv2f64_to_ui8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
- %nv2f64_to_si16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
- %nv2f64_to_ui16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
- %nv2f64_to_si32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
- %nv2f64_to_ui32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
+ %nv2f64_to_si8 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i8>
+ %nv2f64_to_ui8 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i8>
+ %nv2f64_to_si16 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i16>
+ %nv2f64_to_ui16 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i16>
+ %nv2f64_to_si32 = fptosi <vscale x 2 x double> poison to <vscale x 2 x i32>
+ %nv2f64_to_ui32 = fptoui <vscale x 2 x double> poison to <vscale x 2 x i32>
- %nv4f16_to_si8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
- %nv4f16_to_ui8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
- %nv4f16_to_si32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
- %nv4f16_to_ui32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
- %nv4f16_to_si64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
- %nv4f16_to_ui64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+ %nv4f16_to_si8 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i8>
+ %nv4f16_to_ui8 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i8>
+ %nv4f16_to_si32 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i32>
+ %nv4f16_to_ui32 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i32>
+ %nv4f16_to_si64 = fptosi <vscale x 4 x half> poison to <vscale x 4 x i64>
+ %nv4f16_to_ui64 = fptoui <vscale x 4 x half> poison to <vscale x 4 x i64>
- %nv4f32_to_si8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
- %nv4f32_to_ui8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
- %nv4f32_to_si16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
- %nv4f32_to_ui16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
- %nv4f32_to_si64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
- %nv4f32_to_ui64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
+ %nv4f32_to_si8 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i8>
+ %nv4f32_to_ui8 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i8>
+ %nv4f32_to_si16 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i16>
+ %nv4f32_to_ui16 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i16>
+ %nv4f32_to_si64 = fptosi <vscale x 4 x float> poison to <vscale x 4 x i64>
+ %nv4f32_to_ui64 = fptoui <vscale x 4 x float> poison to <vscale x 4 x i64>
- %nv4f64_to_si8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
- %nv4f64_to_ui8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
- %nv4f64_to_si16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
- %nv4f64_to_ui16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
- %nv4f64_to_si32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
- %nv4f64_to_ui32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
+ %nv4f64_to_si8 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i8>
+ %nv4f64_to_ui8 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i8>
+ %nv4f64_to_si16 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i16>
+ %nv4f64_to_ui16 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i16>
+ %nv4f64_to_si32 = fptosi <vscale x 4 x double> poison to <vscale x 4 x i32>
+ %nv4f64_to_ui32 = fptoui <vscale x 4 x double> poison to <vscale x 4 x i32>
- %nv8f16_to_si8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
- %nv8f16_to_ui8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
- %nv8f16_to_si32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
- %nv8f16_to_ui32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
- %nv8f16_to_si64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
- %nv8f16_to_ui64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+ %nv8f16_to_si8 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i8>
+ %nv8f16_to_ui8 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i8>
+ %nv8f16_to_si32 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i32>
+ %nv8f16_to_ui32 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i32>
+ %nv8f16_to_si64 = fptosi <vscale x 8 x half> poison to <vscale x 8 x i64>
+ %nv8f16_to_ui64 = fptoui <vscale x 8 x half> poison to <vscale x 8 x i64>
- %nv8f32_to_si8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
- %nv8f32_to_ui8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
- %nv8f32_to_si16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
- %nv8f32_to_ui16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
- %nv8f32_to_si64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
- %nv8f32_to_ui64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
+ %nv8f32_to_si8 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i8>
+ %nv8f32_to_ui8 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i8>
+ %nv8f32_to_si16 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i16>
+ %nv8f32_to_ui16 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i16>
+ %nv8f32_to_si64 = fptosi <vscale x 8 x float> poison to <vscale x 8 x i64>
+ %nv8f32_to_ui64 = fptoui <vscale x 8 x float> poison to <vscale x 8 x i64>
- %nv8f64_to_si8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
- %nv8f64_to_ui8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
- %nv8f64_to_si16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
- %nv8f64_to_ui16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
- %nv8f64_to_si32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
- %nv8f64_to_ui32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+ %nv8f64_to_si8 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i8>
+ %nv8f64_to_ui8 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i8>
+ %nv8f64_to_si16 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i16>
+ %nv8f64_to_ui16 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i16>
+ %nv8f64_to_si32 = fptosi <vscale x 8 x double> poison to <vscale x 8 x i32>
+ %nv8f64_to_ui32 = fptoui <vscale x 8 x double> poison to <vscale x 8 x i32>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
index 73556d7e..5b30c33 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
@@ -8,67 +8,67 @@ target triple = "aarch64-unknown-linux-gnu"
define void @sve_fptruncs() {
; CHECK-LABEL: 'sve_fptruncs'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x half>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x half>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x float>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
- %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
- %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
+ %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x half>
+ %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x half>
+ %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x half>
- %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
- %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
- %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
+ %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x half>
+ %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x half>
+ %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x half>
- %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
- %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
- %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+ %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x float>
+ %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x float>
+ %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x float>
ret void
}
define void @sve_fptruncs_bf16() {
; CHECK-SVE-LABEL: 'sve_fptruncs_bf16'
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
-; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:17 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
-; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
-; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
-; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:17 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x bfloat>
+; CHECK-SVE-NEXT: Cost Model: Found costs of Invalid for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x bfloat>
; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE2-LABEL: 'sve_fptruncs_bf16'
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
-; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x bfloat>
+; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x bfloat>
; CHECK-SVE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-BF16-LABEL: 'sve_fptruncs_bf16'
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
-; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x bfloat>
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
- %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
- %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
+ %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> poison to <vscale x 2 x bfloat>
+ %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> poison to <vscale x 4 x bfloat>
+ %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> poison to <vscale x 8 x bfloat>
- %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
- %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
- %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+ %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> poison to <vscale x 2 x bfloat>
+ %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> poison to <vscale x 4 x bfloat>
+ %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> poison to <vscale x 8 x bfloat>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
index 3e85760..2c838e2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll
@@ -1,14 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @load_store(ptr %ptrs) {
; CHECK-LABEL: 'load_store'
-; CHECK-NEXT: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef
-; CHECK-NEXT: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load1 = load <vscale x 1 x i128>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load2 = load <vscale x 2 x i128>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load3 = load <vscale x 1 x fp128>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %load4 = load <vscale x 2 x fp128>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 1 x i128> %load1, ptr %ptrs, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%load1 = load <vscale x 1 x i128>, ptr undef
%load2 = load <vscale x 2 x i128>, ptr undef
%load3 = load <vscale x 1 x fp128>, ptr undef
@@ -19,8 +22,10 @@ define void @load_store(ptr %ptrs) {
define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
; CHECK-LABEL: 'masked_load_store'
-; CHECK-NEXT: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
call void @llvm.masked.store.nxv1i128(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask)
ret void
@@ -28,8 +33,10 @@ define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vs
define void @masked_gather_scatter(<vscale x 1 x ptr> %ptrs, <vscale x 1 x ptr> %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) {
; CHECK-LABEL: 'masked_gather_scatter'
-; CHECK-NEXT: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
-; CHECK-NEXT: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru)
call void @llvm.masked.scatter.nxv1i128(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask)
ret void
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 609a23b..0976a10 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -40,50 +40,50 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
define void @vector_insert_extract_idxzero_128b() #1 {
; CHECK-VSCALE-1-LABEL: 'vector_insert_extract_idxzero_128b'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'vector_insert_extract_idxzero_128b'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
- %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
- %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
- %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
- %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
- %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
- %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
- %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
- %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+ %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> poison, i64 0)
+ %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> poison, i64 0)
+ %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+ %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+ %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+ %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+ %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+ %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+ %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
ret void
}
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
@@ -97,50 +97,50 @@ declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x
define void @vector_insert_extract_idxzero_256b() #2 {
; CHECK-VSCALE-1-LABEL: 'vector_insert_extract_idxzero_256b'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> poison, <16 x i16> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'vector_insert_extract_idxzero_256b'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> poison, <16 x i16> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> poison, <16 x i16> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
- %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nx4f32(<vscale x 4 x float> undef, i64 0)
- %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
- %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
- %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
- %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
- %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
- %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
- %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
+ %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> poison, <16 x i16> poison, i64 0)
+ %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nx4f32(<vscale x 4 x float> poison, i64 0)
+ %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> poison, <vscale x 2 x i1> poison, i64 0)
+ %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> poison, i64 0)
+ %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> poison, i64 0)
+ %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> poison, <2 x float> poison, i64 0)
+ %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> poison, i64 0)
+ %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> poison, <vscale x 2 x float> poison, i64 0)
+ %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> poison, i64 0)
ret void
}
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)
@@ -148,157 +148,157 @@ declare <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float>, i64
define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale x 4 x float> %v2, <vscale x 4 x double> %v3) {
; CHECK-VSCALE-1-LABEL: 'reductions'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 3 for: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'reductions'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 3 for: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'reductions'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 3 for: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+ %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> poison)
%add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
%add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
- %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
+ %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> poison)
%mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
%mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
- %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
+ %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> poison)
%and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
%and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
- %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> undef)
+ %or_nxv1i32 = call i32 @llvm.vector.reduce.or.nxv1i32(<vscale x 1 x i32> poison)
%or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
%or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
- %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> undef)
+ %xor_nxv1i32 = call i32 @llvm.vector.reduce.xor.nxv1i32(<vscale x 1 x i32> poison)
%xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
%xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
- %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
+ %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> poison)
%umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
%umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
- %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> undef)
+ %smin_nxv1i64 = call i64 @llvm.vector.reduce.smin.nxv1i64(<vscale x 1 x i64> poison)
%smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
%smin_nxv4i64 = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v1)
- %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> undef)
+ %umax_nxv1i64 = call i64 @llvm.vector.reduce.umax.nxv1i64(<vscale x 1 x i64> poison)
%umax_nxv4i32 = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v0)
%umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
- %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+ %smax_nxv1i64 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> poison)
%smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
%smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
- %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
+ %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> poison)
%fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
%fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
- %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
+ %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> poison)
%fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
%fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
- %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
+ %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> poison)
%fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
%fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
@@ -389,123 +389,123 @@ declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
define void @vector_reverse() #0 {
; CHECK-VSCALE-1-LABEL: 'vector_reverse'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'vector_reverse'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_reverse'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
- %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
- %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
- %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
- %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
- %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
- %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
- %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
- %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
- %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
- %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
- %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
- %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
- %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
- %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
- %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
- %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
- %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
- %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
- %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
- %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
- %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
- %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
- %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
- %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
- %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
- %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+ %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> poison)
+ %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> poison)
+ %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> poison)
+ %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> poison)
+ %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> poison)
+ %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> poison)
+ %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> poison)
+ %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> poison)
+ %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> poison)
+ %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> poison)
+ %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> poison)
+ %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> poison)
+ %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> poison)
+ %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> poison)
+ %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> poison)
+ %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> poison)
+ %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> poison)
+ %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> poison)
+ %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> poison)
+ %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> poison)
+ %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> poison)
+ %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> poison)
+ %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> poison)
+ %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> poison)
+ %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> poison)
+ %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> poison)
+ %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> poison)
ret void
}
declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
@@ -912,158 +912,158 @@ declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>,
define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
- %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
- %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
- %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+ %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+ %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+ %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+ %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
- %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
- %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
- %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
- %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+ %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+ %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+ %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+ %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
- %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
- %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+ %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+ %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
- %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
- %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
- %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
- %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+ %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+ %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+ %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+ %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
- %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
- %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
- %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
- %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+ %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+ %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+ %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+ %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
- %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
- %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+ %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+ %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
ret void
}
define void @fshr() #0 {
; CHECK-VSCALE-1-LABEL: 'fshr'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'fshr'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'fshr'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %1 = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %2 = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %3 = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %4 = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
- call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
- call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
- call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+ call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+ call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+ call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+ call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
ret void
}
define void @fshl() #0 {
; CHECK-VSCALE-1-LABEL: 'fshl'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'fshl'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 5 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'fshl'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %1 = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %2 = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %3 = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 7 for: %4 = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, <vscale x 16 x i8> undef)
- call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, <vscale x 8 x i16> undef)
- call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, <vscale x 4 x i32> undef)
- call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, <vscale x 2 x i64> undef)
+ call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i8> poison, <vscale x 16 x i8> poison)
+ call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i16> poison, <vscale x 8 x i16> poison)
+ call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
+ call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> poison, <vscale x 2 x i64> poison)
ret void
}
@@ -1362,48 +1362,48 @@ define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %m
define void @match() #3 {
; CHECK-VSCALE-1-LABEL: 'match'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> poison, <vscale x 16 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> poison, <vscale x 8 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> poison, <vscale x 4 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> poison, <vscale x 2 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> poison, <16 x i8> poison, <16 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> poison, <8 x i16> poison, <8 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> poison, <4 x i32> poison, <4 x i1> poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> poison)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-VSCALE-2-LABEL: 'match'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> poison, <vscale x 16 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> poison, <vscale x 8 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> poison, <vscale x 4 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> poison, <vscale x 2 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> poison, <16 x i8> poison, <16 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> poison, <8 x i16> poison, <8 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> poison, <4 x i32> poison, <4 x i1> poison)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> poison)
; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'match'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> poison, <vscale x 16 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> poison, <vscale x 8 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> poison, <vscale x 4 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> poison, <vscale x 2 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> poison, <16 x i8> poison, <16 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> poison, <8 x i16> poison, <8 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> poison, <4 x i32> poison, <4 x i1> poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> poison)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
- %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
- %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
- %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+ %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> poison, <vscale x 16 x i1> poison)
+ %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> poison, <vscale x 8 x i1> poison)
+ %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> poison, <vscale x 4 x i1> poison)
+ %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> poison, <vscale x 2 x i1> poison)
- %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
- %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
- %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
- %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+ %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> poison, <16 x i8> poison, <16 x i1> poison)
+ %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> poison, <8 x i16> poison, <8 x i1> poison)
+ %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> poison, <4 x i32> poison, <4 x i1> poison)
+ %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> poison, <2 x i64> poison, <2 x i1> poison)
ret void
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
index 397b737..f7d3719 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
@@ -5,82 +5,82 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @sve_truncs() {
; CHECK-LABEL: 'sve_truncs'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i8_to_i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i8_to_i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i8_to_i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i16_to_i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i16_to_i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv8i16_to_i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv16i16_to_i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i32_to_i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i64_to_i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i8_to_i1 = trunc <vscale x 2 x i8> poison to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i16_to_i1 = trunc <vscale x 2 x i16> poison to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i32_to_i1 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i64_to_i1 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i8_to_i1 = trunc <vscale x 4 x i8> poison to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i16_to_i1 = trunc <vscale x 4 x i16> poison to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i32_to_i1 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i64_to_i1 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i8_to_i1 = trunc <vscale x 8 x i8> poison to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i16_to_i1 = trunc <vscale x 8 x i16> poison to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i32_to_i1 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i1 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i1>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i16_to_i8 = trunc <vscale x 2 x i16> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i8 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i8 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i16 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i16 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i32 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i16_to_i8 = trunc <vscale x 4 x i16> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i8 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i8 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i16 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i16 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i32 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv8i16_to_i8 = trunc <vscale x 8 x i16> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i8 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i8 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i8>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i16 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i16 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv16i16_to_i8 = trunc <vscale x 16 x i16> poison to <vscale x 16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i32_to_i8 = trunc <vscale x 16 x i32> poison to <vscale x 16 x i8>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i64_to_i8 = trunc <vscale x 16 x i64> poison to <vscale x 16 x i8>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %trunc_nxv2i8_to_i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
- %trunc_nxv2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
- %trunc_nxv2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
- %trunc_nxv2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
+ %trunc_nxv2i8_to_i1 = trunc <vscale x 2 x i8> poison to <vscale x 2 x i1>
+ %trunc_nxv2i16_to_i1 = trunc <vscale x 2 x i16> poison to <vscale x 2 x i1>
+ %trunc_nxv2i32_to_i1 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i1>
+ %trunc_nxv2i64_to_i1 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i1>
- %trunc_nxv4i8_to_i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
- %trunc_nxv4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
- %trunc_nxv4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
- %trunc_nxv4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
+ %trunc_nxv4i8_to_i1 = trunc <vscale x 4 x i8> poison to <vscale x 4 x i1>
+ %trunc_nxv4i16_to_i1 = trunc <vscale x 4 x i16> poison to <vscale x 4 x i1>
+ %trunc_nxv4i32_to_i1 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i1>
+ %trunc_nxv4i64_to_i1 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i1>
- %trunc_nxv8i8_to_i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
- %trunc_nxv8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
- %trunc_nxv8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
- %trunc_nxv8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
+ %trunc_nxv8i8_to_i1 = trunc <vscale x 8 x i8> poison to <vscale x 8 x i1>
+ %trunc_nxv8i16_to_i1 = trunc <vscale x 8 x i16> poison to <vscale x 8 x i1>
+ %trunc_nxv8i32_to_i1 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i1>
+ %trunc_nxv8i64_to_i1 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i1>
; Truncates to unpacked or legal types with vscale x 2 elements
- %trunc_nxv2i16_to_i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
- %trunc_nxv2i32_to_i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
- %trunc_nxv2i64_to_i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
- %trunc_nxv2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
- %trunc_nxv2i64_to_i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
- %trunc_nxv2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
+ %trunc_nxv2i16_to_i8 = trunc <vscale x 2 x i16> poison to <vscale x 2 x i8>
+ %trunc_nxv2i32_to_i8 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i8>
+ %trunc_nxv2i64_to_i8 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i8>
+ %trunc_nxv2i32_to_i16 = trunc <vscale x 2 x i32> poison to <vscale x 2 x i16>
+ %trunc_nxv2i64_to_i16 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i16>
+ %trunc_nxv2i64_to_i32 = trunc <vscale x 2 x i64> poison to <vscale x 2 x i32>
; Truncates to unpacked or legal with vscale x 4 elements
- %trunc_nxv4i16_to_i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
- %trunc_nxv4i32_to_i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
- %trunc_nxv4i64_to_i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
- %trunc_nxv4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
- %trunc_nxv4i64_to_i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
- %trunc_nxv4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
+ %trunc_nxv4i16_to_i8 = trunc <vscale x 4 x i16> poison to <vscale x 4 x i8>
+ %trunc_nxv4i32_to_i8 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i8>
+ %trunc_nxv4i64_to_i8 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i8>
+ %trunc_nxv4i32_to_i16 = trunc <vscale x 4 x i32> poison to <vscale x 4 x i16>
+ %trunc_nxv4i64_to_i16 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i16>
+ %trunc_nxv4i64_to_i32 = trunc <vscale x 4 x i64> poison to <vscale x 4 x i32>
; Truncates to unpacked or legal with vscale x 8 elements
- %trunc_nxv8i16_to_i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
- %trunc_nxv8i32_to_i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
- %trunc_nxv8i64_to_i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
- %trunc_nxv8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
- %trunc_nxv8i64_to_i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
+ %trunc_nxv8i16_to_i8 = trunc <vscale x 8 x i16> poison to <vscale x 8 x i8>
+ %trunc_nxv8i32_to_i8 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i8>
+ %trunc_nxv8i64_to_i8 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i8>
+ %trunc_nxv8i32_to_i16 = trunc <vscale x 8 x i32> poison to <vscale x 8 x i16>
+ %trunc_nxv8i64_to_i16 = trunc <vscale x 8 x i64> poison to <vscale x 8 x i16>
; Truncates to unpacked or legal with vscale x 16 elements
- %trunc_nxv16i16_to_i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
- %trunc_nxv16i32_to_i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
- %trunc_nxv16i64_to_i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
+ %trunc_nxv16i16_to_i8 = trunc <vscale x 16 x i16> poison to <vscale x 16 x i8>
+ %trunc_nxv16i32_to_i8 = trunc <vscale x 16 x i32> poison to <vscale x 16 x i8>
+ %trunc_nxv16i64_to_i8 = trunc <vscale x 16 x i64> poison to <vscale x 16 x i8>
ret void
}
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
index aeeb21e..ab1945d 100644
--- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
+++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
@@ -150,7 +150,7 @@ define void @test_typedbuffer() {
; CHECK: Kind: CBuffer
; CHECK: CBuffer size: 4
- %cb1 = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cb1 = call target("dx.CBuffer", <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>)
@llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr @Constants.str)
; CHECK: Resource [[CB1:[0-9]+]]:
; CHECK: Name: Constants
@@ -161,7 +161,7 @@ define void @test_typedbuffer() {
; CHECK: Size: 1
; CHECK: Class: CBV
; CHECK: Kind: CBuffer
- ; CHECK: CBuffer size: 4
+ ; CHECK: CBuffer size: 36
; CHECK-NOT: Resource {{[0-9]+}}:
diff --git a/llvm/test/Analysis/DependenceAnalysis/becount-couldnotcompute.ll b/llvm/test/Analysis/DependenceAnalysis/becount-couldnotcompute.ll
new file mode 100644
index 0000000..49fbad3
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/becount-couldnotcompute.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 | FileCheck %s
+
+; Test for the function isKnownLessThan, which computes a back-edge taken
+; count; that computation can yield a CouldNotCompute SCEV.
+
+define void @test(i64 %conv, ptr %a) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT: Src: %ld = load i32, ptr %arrayidx12, align 4 --> Dst: %ld = load i32, ptr %arrayidx12, align 4
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+ %sub = add i64 %conv, 1
+ br label %loop
+
+loop:
+ %i = phi i64 [ %add26, %loop ], [ 0, %entry ]
+ %arrayidx12 = getelementptr i32, ptr %a, i64 %i
+ %ld = load i32, ptr %arrayidx12, align 4
+ %add26 = add nsw i64 %sub, %i
+ br label %loop
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll
new file mode 100644
index 0000000..220c5a1
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<scalar-evolution>' -disable-output %s 2>&1 | FileCheck %s
+
+declare void @foo()
+
+; Tests with multiple guards, some guarding the same value and some guarding different values.
+
+define void @test_guard_order_b_then_c_and_d(ptr %a, ptr %b, ptr %c, ptr %d) {
+; CHECK-LABEL: 'test_guard_order_b_then_c_and_d'
+; CHECK-NEXT: Classifying expressions for: @test_guard_order_b_then_c_and_d
+; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 1
+; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_b_then_c_and_d
+; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64))
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64))
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
+;
+entry:
+ %cmp.eq.b = icmp ne ptr %a, %b
+ %cmp.eq.c = icmp ne ptr %a, %c
+ %cmp.eq.d = icmp ne ptr %b, %d
+ call void @llvm.assume(i1 %cmp.eq.b)
+ call void @llvm.assume(i1 %cmp.eq.c)
+ call void @llvm.assume(i1 %cmp.eq.d)
+ br label %loop
+
+loop:
+ %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ]
+ %iv.next = getelementptr i8, ptr %iv, i64 1
+ call void @foo()
+ %ec = icmp eq ptr %iv.next, %b
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_guard_order_d_then_c_and_b(ptr %a, ptr %b, ptr %c, ptr %d) {
+; CHECK-LABEL: 'test_guard_order_d_then_c_and_b'
+; CHECK-NEXT: Classifying expressions for: @test_guard_order_d_then_c_and_b
+; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 1
+; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_d_then_c_and_b
+; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64))
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64))
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
+;
+entry:
+ %cmp.eq.b = icmp ne ptr %a, %b
+ %cmp.eq.c = icmp ne ptr %a, %c
+ %cmp.eq.d = icmp ne ptr %b, %d
+ call void @llvm.assume(i1 %cmp.eq.d)
+ call void @llvm.assume(i1 %cmp.eq.c)
+ call void @llvm.assume(i1 %cmp.eq.b)
+ br label %loop
+
+loop:
+ %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ]
+ %iv.next = getelementptr i8, ptr %iv, i64 1
+ call void @foo()
+ %ec = icmp eq ptr %iv.next, %b
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoaddr.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoaddr.ll
new file mode 100644
index 0000000..ebab9f0
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoaddr.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s --data-layout="e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -S -disable-output -disable-verify "-passes=print<scalar-evolution>" 2>&1 | FileCheck --check-prefixes=ALL,X64 %s
+; RUN: opt < %s --data-layout="e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128" -S -disable-output -disable-verify "-passes=print<scalar-evolution>" 2>&1 | FileCheck --check-prefixes=ALL,X32 %s
+
+declare void @useptr(ptr)
+
+define void @ptrtoaddr(ptr %in, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; X64-LABEL: 'ptrtoaddr'
+; X64-NEXT: Classifying expressions for: @ptrtoaddr
+; X64-NEXT: %p0 = ptrtoaddr ptr %in to i64
+; X64-NEXT: --> %p0 U: full-set S: full-set
+; X64-NEXT: %p1 = ptrtoaddr ptr %in to i32
+; X64-NEXT: --> %p1 U: full-set S: full-set
+; X64-NEXT: %p2 = ptrtoaddr ptr %in to i16
+; X64-NEXT: --> %p2 U: full-set S: full-set
+; X64-NEXT: %p3 = ptrtoaddr ptr %in to i128
+; X64-NEXT: --> %p3 U: full-set S: full-set
+; X64-NEXT: Determining loop execution counts for: @ptrtoaddr
+;
+; X32-LABEL: 'ptrtoaddr'
+; X32-NEXT: Classifying expressions for: @ptrtoaddr
+; X32-NEXT: %p0 = ptrtoaddr ptr %in to i64
+; X32-NEXT: --> %p0 U: full-set S: full-set
+; X32-NEXT: %p1 = ptrtoaddr ptr %in to i32
+; X32-NEXT: --> %p1 U: full-set S: full-set
+; X32-NEXT: %p2 = ptrtoaddr ptr %in to i16
+; X32-NEXT: --> %p2 U: full-set S: full-set
+; X32-NEXT: %p3 = ptrtoaddr ptr %in to i128
+; X32-NEXT: --> %p3 U: full-set S: full-set
+; X32-NEXT: Determining loop execution counts for: @ptrtoaddr
+;
+ %p0 = ptrtoaddr ptr %in to i64
+ %p1 = ptrtoaddr ptr %in to i32
+ %p2 = ptrtoaddr ptr %in to i16
+ %p3 = ptrtoaddr ptr %in to i128
+ store i64 %p0, ptr %out0
+ store i32 %p1, ptr %out1
+ store i16 %p2, ptr %out2
+ store i128 %p3, ptr %out3
+ ret void
+}
+
+define void @ptrtoaddr_as1(ptr addrspace(1) %in, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; X64-LABEL: 'ptrtoaddr_as1'
+; X64-NEXT: Classifying expressions for: @ptrtoaddr_as1
+; X64-NEXT: %p0 = ptrtoaddr ptr addrspace(1) %in to i64
+; X64-NEXT: --> %p0 U: full-set S: full-set
+; X64-NEXT: %p1 = ptrtoaddr ptr addrspace(1) %in to i32
+; X64-NEXT: --> %p1 U: full-set S: full-set
+; X64-NEXT: %p2 = ptrtoaddr ptr addrspace(1) %in to i16
+; X64-NEXT: --> %p2 U: full-set S: full-set
+; X64-NEXT: %p3 = ptrtoaddr ptr addrspace(1) %in to i128
+; X64-NEXT: --> %p3 U: full-set S: full-set
+; X64-NEXT: Determining loop execution counts for: @ptrtoaddr_as1
+;
+; X32-LABEL: 'ptrtoaddr_as1'
+; X32-NEXT: Classifying expressions for: @ptrtoaddr_as1
+; X32-NEXT: %p0 = ptrtoaddr ptr addrspace(1) %in to i64
+; X32-NEXT: --> %p0 U: full-set S: full-set
+; X32-NEXT: %p1 = ptrtoaddr ptr addrspace(1) %in to i32
+; X32-NEXT: --> %p1 U: full-set S: full-set
+; X32-NEXT: %p2 = ptrtoaddr ptr addrspace(1) %in to i16
+; X32-NEXT: --> %p2 U: full-set S: full-set
+; X32-NEXT: %p3 = ptrtoaddr ptr addrspace(1) %in to i128
+; X32-NEXT: --> %p3 U: full-set S: full-set
+; X32-NEXT: Determining loop execution counts for: @ptrtoaddr_as1
+;
+ %p0 = ptrtoaddr ptr addrspace(1) %in to i64
+ %p1 = ptrtoaddr ptr addrspace(1) %in to i32
+ %p2 = ptrtoaddr ptr addrspace(1) %in to i16
+ %p3 = ptrtoaddr ptr addrspace(1) %in to i128
+ store i64 %p0, ptr %out0
+ store i32 %p1, ptr %out1
+ store i16 %p2, ptr %out2
+ store i128 %p3, ptr %out3
+ ret void
+}
+
+define void @ptrtoaddr_of_bitcast(ptr %in, ptr %out0) {
+; X64-LABEL: 'ptrtoaddr_of_bitcast'
+; X64-NEXT: Classifying expressions for: @ptrtoaddr_of_bitcast
+; X64-NEXT: %in_casted = bitcast ptr %in to ptr
+; X64-NEXT: --> %in U: full-set S: full-set
+; X64-NEXT: %p0 = ptrtoaddr ptr %in_casted to i64
+; X64-NEXT: --> %p0 U: full-set S: full-set
+; X64-NEXT: Determining loop execution counts for: @ptrtoaddr_of_bitcast
+;
+; X32-LABEL: 'ptrtoaddr_of_bitcast'
+; X32-NEXT: Classifying expressions for: @ptrtoaddr_of_bitcast
+; X32-NEXT: %in_casted = bitcast ptr %in to ptr
+; X32-NEXT: --> %in U: full-set S: full-set
+; X32-NEXT: %p0 = ptrtoaddr ptr %in_casted to i64
+; X32-NEXT: --> %p0 U: full-set S: full-set
+; X32-NEXT: Determining loop execution counts for: @ptrtoaddr_of_bitcast
+;
+ %in_casted = bitcast ptr %in to ptr
+ %p0 = ptrtoaddr ptr %in_casted to i64
+ store i64 %p0, ptr %out0
+ ret void
+}
+
+define void @ptrtoaddr_of_nullptr(ptr %out0) {
+; ALL-LABEL: 'ptrtoaddr_of_nullptr'
+; ALL-NEXT: Classifying expressions for: @ptrtoaddr_of_nullptr
+; ALL-NEXT: %p0 = ptrtoaddr ptr null to i64
+; ALL-NEXT: --> %p0 U: full-set S: full-set
+; ALL-NEXT: Determining loop execution counts for: @ptrtoaddr_of_nullptr
+;
+ %p0 = ptrtoaddr ptr null to i64
+ store i64 %p0, ptr %out0
+ ret void
+}
+
+define void @ptrtoaddr_of_gep(ptr %in, ptr %out0) {
+; X64-LABEL: 'ptrtoaddr_of_gep'
+; X64-NEXT: Classifying expressions for: @ptrtoaddr_of_gep
+; X64-NEXT: %in_adj = getelementptr inbounds i8, ptr %in, i64 42
+; X64-NEXT: --> (42 + %in) U: full-set S: full-set
+; X64-NEXT: %p0 = ptrtoaddr ptr %in_adj to i64
+; X64-NEXT: --> %p0 U: full-set S: full-set
+; X64-NEXT: Determining loop execution counts for: @ptrtoaddr_of_gep
+;
+; X32-LABEL: 'ptrtoaddr_of_gep'
+; X32-NEXT: Classifying expressions for: @ptrtoaddr_of_gep
+; X32-NEXT: %in_adj = getelementptr inbounds i8, ptr %in, i64 42
+; X32-NEXT: --> (42 + %in) U: full-set S: full-set
+; X32-NEXT: %p0 = ptrtoaddr ptr %in_adj to i64
+; X32-NEXT: --> %p0 U: full-set S: full-set
+; X32-NEXT: Determining loop execution counts for: @ptrtoaddr_of_gep
+;
+ %in_adj = getelementptr inbounds i8, ptr %in, i64 42
+ %p0 = ptrtoaddr ptr %in_adj to i64
+ store i64 %p0, ptr %out0
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
index e784d25..0c1f37b 100644
--- a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
@@ -382,7 +382,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) {
; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64
; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
; X64-NEXT: %i10 = sub i64 %i9, %i4
-; X64-NEXT: --> {0,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,-1) S: [0,-1) Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
; X64-NEXT: --> {%arg2,+,1}<nw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable }
; X64-NEXT: %i12 = load i8, ptr %i11, align 1
@@ -393,7 +393,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) {
; X64-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char
; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
-; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -1
+; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -2
; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
; X64-NEXT: Loop %bb6: Trip multiple is 1
;
@@ -406,9 +406,9 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) {
; X32-NEXT: %i8 = load i8, ptr %i7, align 1
; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64
-; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934591) S: [0,8589934591) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934590) S: [0,8589934590) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
; X32-NEXT: %i10 = sub i64 %i9, %i4
-; X32-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,4294967296) S: [0,4294967296) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,4294967295) S: [0,4294967295) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable }
; X32-NEXT: %i12 = load i8, ptr %i11, align 1
@@ -419,7 +419,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) {
; X32-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char
; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
-; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -1
+; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -2
; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
; X32-NEXT: Loop %bb6: Trip multiple is 1
;
@@ -447,6 +447,84 @@ bb5:
ret void
}
+define void @pr46786_c26_char_cmp_ops_swapped(ptr %arg, ptr %arg1, ptr %arg2) {
+; X64-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X64-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X64-NEXT: --> (ptrtoint ptr %arg to i64) U: full-set S: full-set
+; X64-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X64-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i8 = load i8, ptr %i7, align 1
+; X64-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i10 = sub i64 %i9, %i4
+; X64-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,-1) S: [0,-1) Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X64-NEXT: --> {%arg2,+,1}<nw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i12 = load i8, ptr %i11, align 1
+; X64-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i13 = add i8 %i12, %i8
+; X64-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X64-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -2
+; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: Trip multiple is 1
+;
+; X32-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X32-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X32-NEXT: --> (zext i32 (ptrtoint ptr %arg to i32) to i64) U: [0,4294967296) S: [0,4294967296)
+; X32-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X32-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i8 = load i8, ptr %i7, align 1
+; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934590) S: [0,8589934590) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i10 = sub i64 %i9, %i4
+; X32-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,4294967295) S: [0,4294967295) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i12 = load i8, ptr %i11, align 1
+; X32-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i13 = add i8 %i12, %i8
+; X32-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X32-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -2
+; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: Trip multiple is 1
+;
+ %i = icmp eq ptr %arg1, %arg
+ br i1 %i, label %bb5, label %bb3
+
+bb3:
+ %i4 = ptrtoint ptr %arg to i64
+ br label %bb6
+
+bb6:
+ %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+ %i8 = load i8, ptr %i7
+ %i9 = ptrtoint ptr %i7 to i64
+ %i10 = sub i64 %i9, %i4
+ %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+ %i12 = load i8, ptr %i11
+ %i13 = add i8 %i12, %i8
+ store i8 %i13, ptr %i11
+ %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+ %i15 = icmp eq ptr %i14, %arg1
+ br i1 %i15, label %bb5, label %bb6
+
+bb5:
+ ret void
+}
+
+
; void pr46786_c26_int(int* start, int *end, int *other) {
; for (int* cur = start; cur != end; ++cur)
; other[cur - start] += *cur;
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
index 7ba422d..a477465c 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -578,22 +578,22 @@ define void @test_ptr_aligned_by_2_and_4_via_assumption(ptr %start, ptr %end) {
; CHECK-LABEL: 'test_ptr_aligned_by_2_and_4_via_assumption'
; CHECK-NEXT: Classifying expressions for: @test_ptr_aligned_by_2_and_4_via_assumption
; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_2_and_4_via_assumption
; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ]
@@ -615,9 +615,9 @@ define void @test_ptrs_aligned_by_4_via_assumption(ptr %start, ptr %end) {
; CHECK-LABEL: 'test_ptrs_aligned_by_4_via_assumption'
; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_4_via_assumption
; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: (4 + (4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_4_via_assumption
; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903
@@ -644,9 +644,9 @@ define void @test_ptrs_aligned_by_8_via_assumption(ptr %start, ptr %end) {
; CHECK-LABEL: 'test_ptrs_aligned_by_8_via_assumption'
; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_8_via_assumption
; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: (4 + (4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start)<nuw><nsw>,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_8_via_assumption
; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903
@@ -677,22 +677,22 @@ define void @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors(ptr
; CHECK-NEXT: %c = call i1 @cond()
; CHECK-NEXT: --> %c U: full-set S: full-set
; CHECK-NEXT: %iv = phi ptr [ %start, %then ], [ %start, %else ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors
; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ]
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
index 7ec674a..dc4a72e 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -aa-pipeline=tbaa,basic-aa -passes=gvn -S < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
@@ -5,12 +6,15 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; TBAA should prove that these calls don't interfere, since they are
; IntrArgReadMem and have TBAA metadata.
-; CHECK: define <8 x i16> @test0(ptr %p, ptr %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[NUW:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> %y, ptr %q, i32 16, <8 x i1> %m)
-; CHECK-NEXT: %c = add <8 x i16> %a, %a
define <8 x i16> @test0(ptr %p, ptr %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
+; CHECK-LABEL: define <8 x i16> @test0(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], <8 x i16> [[Y:%.*]], <8 x i1> [[M:%.*]], <8 x i16> [[PT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P]], i32 16, <8 x i1> [[M]], <8 x i16> [[PT]]) #[[ATTR2:[0-9]+]], !tbaa [[B_TBAA0:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr [[Q]], i32 16, <8 x i1> [[M]]), !tbaa [[A_TBAA3:![0-9]+]]
+; CHECK-NEXT: [[C:%.*]] = add <8 x i16> [[A]], [[A]]
+; CHECK-NEXT: ret <8 x i16> [[C]]
+;
entry:
%a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2
call void @llvm.masked.store.v8i16.p0(<8 x i16> %y, ptr %q, i32 16, <8 x i1> %m), !tbaa !1
@@ -24,10 +28,16 @@ declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) nounwind
; CHECK: attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
; CHECK: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
-; CHECK: attributes [[NUW]] = { nounwind }
!0 = !{!"tbaa root"}
!1 = !{!3, !3, i64 0}
!2 = !{!4, !4, i64 0}
!3 = !{!"A", !0}
!4 = !{!"B", !0}
+;.
+; CHECK: [[B_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"B", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"tbaa root"}
+; CHECK: [[A_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+; CHECK: [[META4]] = !{!"A", [[META2]]}
+;.
diff --git a/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll
index 377c002..49174d2 100644
--- a/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll
+++ b/llvm/test/Assembler/autoupgrade-lifetime-intrinsics.ll
@@ -56,6 +56,45 @@ define void @remove_unanalyzable(ptr %p) {
ret void
}
+define void @no_declaration() {
+; CHECK-LABEL: define void @no_declaration() {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1, addrspace(2)
+; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8, addrspace(2)
+ call void @llvm.lifetime.start.p2(i64 1, ptr addrspace(2) %a)
+ call void @llvm.lifetime.end.p2(i64 1, ptr addrspace(2) %a)
+ ret void
+}
+
+define void @no_suffix1() {
+; CHECK-LABEL: define void @no_suffix1() {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1, addrspace(3)
+; CHECK-NEXT: call void @llvm.lifetime.start.p3(ptr addrspace(3) [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p3(ptr addrspace(3) [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8, addrspace(3)
+ call void @llvm.lifetime.start(i64 1, ptr addrspace(3) %a)
+ call void @llvm.lifetime.end(i64 1, ptr addrspace(3) %a)
+ ret void
+}
+
+define void @no_suffix2() {
+; CHECK-LABEL: define void @no_suffix2() {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1, addrspace(4)
+; CHECK-NEXT: call void @llvm.lifetime.start.p4(ptr addrspace(4) [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p4(ptr addrspace(4) [[A]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca i8, addrspace(4)
+ call void @llvm.lifetime.start(i64 1, ptr addrspace(4) %a)
+ call void @llvm.lifetime.end(i64 1, ptr addrspace(4) %a)
+ ret void
+}
+
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)
declare void @llvm.lifetime.start.p1(i64, ptr addrspace(1))
diff --git a/llvm/test/Assembler/autoupgrade-wasm-intrinsics.ll b/llvm/test/Assembler/autoupgrade-wasm-intrinsics.ll
index 012fa1d..e54efa4 100644
--- a/llvm/test/Assembler/autoupgrade-wasm-intrinsics.ll
+++ b/llvm/test/Assembler/autoupgrade-wasm-intrinsics.ll
@@ -46,7 +46,10 @@ define <4 x float> @test_fms(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
ret <4 x float> %res
}
-declare <16 x i8> @llvm.wasm.laneselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+; This declaration is intentionally omitted to check that intrinsic upgrade
+; also works without a declaration.
+; declare <16 x i8> @llvm.wasm.laneselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
declare <8 x i16> @llvm.wasm.dot.i8x16.i7x16.signed(<16 x i8>, <16 x i8>)
declare <4 x i32> @llvm.wasm.dot.i8x16.i7x16.add.signed(<16 x i8>, <16 x i8>, <4 x i32>)
declare <4 x float> @llvm.wasm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/Assembler/dicompileunit-invalid-language-version.ll b/llvm/test/Assembler/dicompileunit-invalid-language-version.ll
new file mode 100644
index 0000000..b3794ac
--- /dev/null
+++ b/llvm/test/Assembler/dicompileunit-invalid-language-version.ll
@@ -0,0 +1,25 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/dw_lang_with_version.ll -disable-output 2>&1 | FileCheck %s --check-prefix=WRONG-ATTR
+; RUN: not llvm-as < %t/overflow.ll -disable-output 2>&1 | FileCheck %s --check-prefix=OVERFLOW
+; RUN: not llvm-as < %t/version_without_name.ll -disable-output 2>&1 | FileCheck %s --check-prefix=NO-NAME
+; RUN: not llvm-as < %t/negative.ll -disable-output 2>&1 | FileCheck %s --check-prefix=NEGATIVE
+
+; WRONG-ATTR: error: 'sourceLanguageVersion' requires an associated 'sourceLanguageName' on !DICompileUnit
+; OVERFLOW: error: value for 'sourceLanguageVersion' too large, limit is 4294967295
+; NEGATIVE: error: expected unsigned integer
+; NO-NAME: error: missing one of 'language' or 'sourceLanguageName', required for !DICompileUnit
+
+;--- dw_lang_with_version.ll
+!0 = distinct !DICompileUnit(language: DW_LANG_C, sourceLanguageVersion: 1,
+ file: !DIFile(filename: "", directory: ""))
+
+;--- overflow.ll
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 4294967298)
+
+;--- negative.ll
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: -1,
+ file: !DIFile(filename: "", directory: ""))
+
+;--- version_without_name.ll
+!0 = distinct !DICompileUnit(sourceLanguageVersion: 1,
+ file: !DIFile(filename: "", directory: ""))
diff --git a/llvm/test/Assembler/implicit-intrinsic-declaration-invalid3.ll b/llvm/test/Assembler/implicit-intrinsic-declaration-invalid3.ll
index ad5a96a..4caee57 100644
--- a/llvm/test/Assembler/implicit-intrinsic-declaration-invalid3.ll
+++ b/llvm/test/Assembler/implicit-intrinsic-declaration-invalid3.ll
@@ -2,7 +2,7 @@
; Use of unknown intrinsic without declaration should be rejected.
-; CHECK: error: use of undefined value '@llvm.foobar'
+; CHECK: error: unknown intrinsic 'llvm.foobar'
define void @test() {
call i8 @llvm.foobar(i8 0, i16 1)
ret void
diff --git a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
index 83b37da..75e5fa0 100644
--- a/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
+++ b/llvm/test/Bindings/llvm-c/debug_info_new_format.ll
@@ -3,37 +3,37 @@
; CHECK: ; ModuleID = 'debuginfo.c'
; CHECK-NEXT: source_filename = "debuginfo.c"
-
-; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !44 {
+
+; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !45 {
; CHECK-NEXT: entry:
-; CHECK-NEXT: #dbg_declare(i64 0, !49, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_declare(i64 0, !50, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_declare(i64 0, !51, !DIExpression(), !58)
-; CHECK-NEXT: #dbg_label(!59, !58)
+; CHECK-NEXT: #dbg_declare(i64 0, !50, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_declare(i64 0, !51, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_declare(i64 0, !52, !DIExpression(), !59)
+; CHECK-NEXT: #dbg_label(!60, !59)
; CHECK-NEXT: br label %vars
-; CHECK-NEXT: #dbg_label(!60, !58)
+; CHECK-NEXT: #dbg_label(!61, !59)
; CHECK-NEXT: br label %vars
; CHECK: vars: ; preds = %entry, %entry
; CHECK-NEXT: %p1 = phi i64 [ 0, %entry ]
; CHECK-NEXT: %p2 = phi i64 [ 0, %entry ]
-; CHECK-NEXT: #dbg_value(i64 0, !42, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !61)
-; CHECK-NEXT: #dbg_value(i64 1, !52, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !61)
+; CHECK-NEXT: #dbg_value(i64 0, !43, !DIExpression(DW_OP_constu, 0, DW_OP_stack_value), !62)
+; CHECK-NEXT: #dbg_value(i64 1, !53, !DIExpression(DW_OP_constu, 1, DW_OP_stack_value), !62)
; CHECK-NEXT: %a = add i64 %p1, %p2
; CHECK-NEXT: ret i64 0
; CHECK-NEXT: }
; CHECK: !llvm.dbg.cu = !{!0}
-; CHECK-NEXT: !FooType = !{!33}
+; CHECK-NEXT: !FooType = !{!34}
; CHECK-NEXT: !EnumTest = !{!3}
; CHECK-NEXT: !LargeEnumTest = !{!11}
-; CHECK-NEXT: !SubrangeType = !{!36}
-; CHECK-NEXT: !SetType1 = !{!37}
-; CHECK-NEXT: !SetType2 = !{!38}
-; CHECK-NEXT: !DynType = !{!39}
-; CHECK-NEXT: !ClassType = !{!54}
+; CHECK-NEXT: !SubrangeType = !{!37}
+; CHECK-NEXT: !SetType1 = !{!38}
+; CHECK-NEXT: !SetType2 = !{!39}
+; CHECK-NEXT: !DynType = !{!40}
+; CHECK-NEXT: !ClassType = !{!55}
-; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !16, imports: !24, macros: !28, splitDebugInlining: false, sysroot: "/")
+; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !16, imports: !24, macros: !29, splitDebugInlining: false, sysroot: "/")
; CHECK-NEXT: !1 = !DIFile(filename: "debuginfo.c", directory: ".")
; CHECK-NEXT: !2 = !{!3, !11}
; CHECK-NEXT: !3 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "EnumTest", scope: !4, file: !1, baseType: !6, size: 64, elements: !7)
@@ -57,41 +57,42 @@
; CHECK-NEXT: !21 = !DIGlobalVariableExpression(var: !22, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value))
; CHECK-NEXT: !22 = distinct !DIGlobalVariable(name: "global", scope: !5, file: !1, line: 1, type: !23, isLocal: true, isDefinition: true)
; CHECK-NEXT: !23 = !DIDerivedType(tag: DW_TAG_typedef, name: "int64_t", scope: !1, file: !1, line: 42, baseType: !6)
-; CHECK-NEXT: !24 = !{!25, !27}
-; CHECK-NEXT: !25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !26, file: !1, line: 42)
+; CHECK-NEXT: !24 = !{!25, !28}
+; CHECK-NEXT: !25 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !26, file: !27, line: 42)
; CHECK-NEXT: !26 = !DIModule(scope: null, name: "llvm-c-test-import", includePath: "/test/include/llvm-c-test-import.h")
-; CHECK-NEXT: !27 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !25, file: !1, line: 42)
-; CHECK-NEXT: !28 = !{!29}
-; CHECK-NEXT: !29 = !DIMacroFile(file: !1, nodes: !30)
-; CHECK-NEXT: !30 = !{!31, !32}
-; CHECK-NEXT: !31 = !DIMacro(type: DW_MACINFO_define, name: "SIMPLE_DEFINE")
-; CHECK-NEXT: !32 = !DIMacro(type: DW_MACINFO_define, name: "VALUE_DEFINE", value: "1")
-; CHECK-NEXT: !33 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !34, size: 192, dwarfAddressSpace: 0)
-; CHECK-NEXT: !34 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !35, runtimeLang: DW_LANG_C89, identifier: "MyStruct")
-; CHECK-NEXT: !35 = !{!6, !6, !6}
-; CHECK-NEXT: !36 = !DISubrangeType(name: "foo", scope: !1, file: !1, line: 42, size: 64, baseType: !6, lowerBound: i64 0, upperBound: i64 1, stride: i64 8, bias: i64 4)
-; CHECK-NEXT: !37 = !DIDerivedType(tag: DW_TAG_set_type, name: "enumset", scope: !1, file: !1, line: 42, baseType: !3, size: 64)
-; CHECK-NEXT: !38 = !DIDerivedType(tag: DW_TAG_set_type, name: "subrangeset", scope: !1, file: !1, line: 42, baseType: !36, size: 64)
-; CHECK-NEXT: !39 = !DICompositeType(tag: DW_TAG_array_type, name: "foo", scope: !1, file: !1, line: 42, baseType: !6, size: 640, elements: !40, dataLocation: !DIExpression(), associated: !42, rank: !DIExpression())
-; CHECK-NEXT: !40 = !{!41}
-; CHECK-NEXT: !41 = !DISubrange(count: 10, lowerBound: 0)
-; CHECK-NEXT: !42 = !DILocalVariable(name: "d", scope: !43, file: !1, line: 43, type: !6)
-; CHECK-NEXT: !43 = distinct !DILexicalBlock(scope: !44, file: !1, line: 42)
-; CHECK-NEXT: !44 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !45, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !48)
-; CHECK-NEXT: !45 = !DISubroutineType(types: !46)
-; CHECK-NEXT: !46 = !{!6, !6, !47}
-; CHECK-NEXT: !47 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !40)
-; CHECK-NEXT: !48 = !{!49, !50, !51, !42, !52, !53}
-; CHECK-NEXT: !49 = !DILocalVariable(name: "a", arg: 1, scope: !44, file: !1, line: 42, type: !6)
-; CHECK-NEXT: !50 = !DILocalVariable(name: "b", arg: 2, scope: !44, file: !1, line: 42, type: !6)
-; CHECK-NEXT: !51 = !DILocalVariable(name: "c", arg: 3, scope: !44, file: !1, line: 42, type: !47)
-; CHECK-NEXT: !52 = !DILocalVariable(name: "e", scope: !43, file: !1, line: 44, type: !6)
-; CHECK-NEXT: !53 = !DILabel(scope: !44, name: "label3", file: !1, line: 42)
-; CHECK-NEXT: !54 = !DICompositeType(tag: DW_TAG_class_type, name: "Class", scope: !4, file: !1, size: 192, flags: DIFlagFwdDecl, elements: !55, identifier: "FooClass")
-; CHECK-NEXT: !55 = !{!56}
-; CHECK-NEXT: !56 = !{!6, !6, !57}
-; CHECK-NEXT: !57 = !DIBasicType(name: "Int32", size: 32)
-; CHECK-NEXT: !58 = !DILocation(line: 42, scope: !44)
-; CHECK-NEXT: !59 = !DILabel(scope: !44, name: "label1", file: !1, line: 42)
-; CHECK-NEXT: !60 = !DILabel(scope: !44, name: "label2", file: !1, line: 42)
-; CHECK-NEXT: !61 = !DILocation(line: 43, scope: !44)
+; CHECK-NEXT: !27 = !DIFile(filename: "debuginfo.c", directory: ".", checksumkind: CSK_MD5, checksum: "1234", source: "source")
+; CHECK-NEXT: !28 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !25, file: !1, line: 42)
+; CHECK-NEXT: !29 = !{!30}
+; CHECK-NEXT: !30 = !DIMacroFile(file: !1, nodes: !31)
+; CHECK-NEXT: !31 = !{!32, !33}
+; CHECK-NEXT: !32 = !DIMacro(type: DW_MACINFO_define, name: "SIMPLE_DEFINE")
+; CHECK-NEXT: !33 = !DIMacro(type: DW_MACINFO_define, name: "VALUE_DEFINE", value: "1")
+; CHECK-NEXT: !34 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !35, size: 192, dwarfAddressSpace: 0)
+; CHECK-NEXT: !35 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !36, runtimeLang: DW_LANG_C89, identifier: "MyStruct")
+; CHECK-NEXT: !36 = !{!6, !6, !6}
+; CHECK-NEXT: !37 = !DISubrangeType(name: "foo", scope: !1, file: !1, line: 42, size: 64, baseType: !6, lowerBound: i64 0, upperBound: i64 1, stride: i64 8, bias: i64 4)
+; CHECK-NEXT: !38 = !DIDerivedType(tag: DW_TAG_set_type, name: "enumset", scope: !1, file: !1, line: 42, baseType: !3, size: 64)
+; CHECK-NEXT: !39 = !DIDerivedType(tag: DW_TAG_set_type, name: "subrangeset", scope: !1, file: !1, line: 42, baseType: !37, size: 64)
+; CHECK-NEXT: !40 = !DICompositeType(tag: DW_TAG_array_type, name: "foo", scope: !1, file: !1, line: 42, baseType: !6, size: 640, elements: !41, dataLocation: !DIExpression(), associated: !43, rank: !DIExpression())
+; CHECK-NEXT: !41 = !{!42}
+; CHECK-NEXT: !42 = !DISubrange(count: 10, lowerBound: 0)
+; CHECK-NEXT: !43 = !DILocalVariable(name: "d", scope: !44, file: !1, line: 43, type: !6)
+; CHECK-NEXT: !44 = distinct !DILexicalBlock(scope: !45, file: !1, line: 42)
+; CHECK-NEXT: !45 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !46, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !49)
+; CHECK-NEXT: !46 = !DISubroutineType(types: !47)
+; CHECK-NEXT: !47 = !{!6, !6, !48}
+; CHECK-NEXT: !48 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !41)
+; CHECK-NEXT: !49 = !{!50, !51, !52, !43, !53, !54}
+; CHECK-NEXT: !50 = !DILocalVariable(name: "a", arg: 1, scope: !45, file: !1, line: 42, type: !6)
+; CHECK-NEXT: !51 = !DILocalVariable(name: "b", arg: 2, scope: !45, file: !1, line: 42, type: !6)
+; CHECK-NEXT: !52 = !DILocalVariable(name: "c", arg: 3, scope: !45, file: !1, line: 42, type: !48)
+; CHECK-NEXT: !53 = !DILocalVariable(name: "e", scope: !44, file: !1, line: 44, type: !6)
+; CHECK-NEXT: !54 = !DILabel(scope: !45, name: "label3", file: !1, line: 42)
+; CHECK-NEXT: !55 = !DICompositeType(tag: DW_TAG_class_type, name: "Class", scope: !4, file: !1, size: 192, flags: DIFlagFwdDecl, elements: !56, identifier: "FooClass")
+; CHECK-NEXT: !56 = !{!57}
+; CHECK-NEXT: !57 = !{!6, !6, !58}
+; CHECK-NEXT: !58 = !DIBasicType(name: "Int32", size: 32)
+; CHECK-NEXT: !59 = !DILocation(line: 42, scope: !45)
+; CHECK-NEXT: !60 = !DILabel(scope: !45, name: "label1", file: !1, line: 42)
+; CHECK-NEXT: !61 = !DILabel(scope: !45, name: "label2", file: !1, line: 42)
+; CHECK-NEXT: !62 = !DILocation(line: 43, scope: !45)
diff --git a/llvm/test/Bitcode/Inputs/compile-unit-no-versioned-language.bc b/llvm/test/Bitcode/Inputs/compile-unit-no-versioned-language.bc
new file mode 100644
index 0000000..461a34d0
--- /dev/null
+++ b/llvm/test/Bitcode/Inputs/compile-unit-no-versioned-language.bc
Binary files differ
diff --git a/llvm/test/Bitcode/dwarf-source-language-version.ll b/llvm/test/Bitcode/dwarf-source-language-version.ll
new file mode 100644
index 0000000..311afd5
--- /dev/null
+++ b/llvm/test/Bitcode/dwarf-source-language-version.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s --implicit-check-not "sourceLanguageVersion: 0"
+
+; CHECK: sourceLanguageVersion: 120
+
+source_filename = "cu.cpp"
+target triple = "arm64-apple-macosx"
+
+!llvm.dbg.cu = !{!0, !5}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus, sourceLanguageVersion: 120, file: !1, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!1 = !DIFile(filename: "cu.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus, sourceLanguageVersion: 0, file: !6, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!6 = !DIFile(filename: "cu2.cpp", directory: "/tmp")
diff --git a/llvm/test/Bitcode/upgrade-DICompileUnit-no-versioned-language.test b/llvm/test/Bitcode/upgrade-DICompileUnit-no-versioned-language.test
new file mode 100644
index 0000000..9475f9b
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-DICompileUnit-no-versioned-language.test
@@ -0,0 +1,21 @@
+; Test loading metadata which was not aware of versioned language names.
+;
+; RUN: llvm-dis -o - %p/Inputs/compile-unit-no-versioned-language.bc \
+; RUN: | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+
+; Input bitcode file was compiled from following source on
+; LLVM commit `fc22b58c25963ece6b041cadbdc931c2338955e4`:
+;
+; source_filename = "cu.cpp"
+; target triple = "arm64-apple-macosx"
+;
+; !llvm.dbg.cu = !{!0}
+; !llvm.module.flags = !{!3, !4}
+;
+; !0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+; !1 = !DIFile(filename: "cu.cpp", directory: "/tmp")
+; !2 = !{}
+; !3 = !{i32 7, !"Dwarf Version", i32 5}
+; !4 = !{i32 2, !"Debug Info Version", i32 3}
+
+; CHECK: distinct !DICompileUnit(language: DW_LANG_ObjC,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir
new file mode 100644
index 0000000..824ada1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir
@@ -0,0 +1,278 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %1:_ KnownBits:00011000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:00011010 SignBits:3
+ %0:_(s8) = G_CONSTANT i8 2
+ %1:_(s8) = G_CONSTANT i8 24
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:00000001 SignBits:7
+ ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 1
+ %1:_(s8) = G_CONSTANT i8 255
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: CstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 255
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: CstSeven
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstSeven
+ ; CHECK-NEXT: %0:_ KnownBits:00001000 SignBits:4
+ ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000111 SignBits:5
+ %0:_(s8) = G_CONSTANT i8 8
+ %1:_(s8) = G_CONSTANT i8 255
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: CstNeg
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNeg
+ ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:11100010 SignBits:3
+ %0:_(s8) = G_CONSTANT i8 224
+ %1:_(s8) = G_CONSTANT i8 2
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: ScalarRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_ADD %0, %1
+...
+---
+name: ScalarNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 255
+ %4:_(s8) = G_ADD %2, %3
+...
+---
+name: ScalarLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_ADD %1, %0
+...
+---
+name: ScalarPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5
+ ; CHECK-NEXT: %4:_ KnownBits:000????? SignBits:3
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 5
+ %4:_(s8) = G_ADD %2, %3
+...
+---
+name: VectorCstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstZero
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 1
+ %1:_(s16) = G_CONSTANT i16 65535
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %4:_(<4 x s16>) = G_ADD %2, %3
+...
+---
+name: VectorCstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(s16) = G_CONSTANT i16 65535
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %4:_(<4 x s16>) = G_ADD %2, %3
+...
+---
+name: VectorVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(<4 x s16>) = COPY $d1
+ %2:_(<4 x s16>) = G_ADD %0, %1
+...
+---
+name: VectorRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_ADD %2, %0
+...
+---
+name: VectorNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %5:_ KnownBits:1111111111111111 SignBits:16
+ ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 65535
+ %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4
+ %6:_(<4 x s16>) = G_ADD %3, %5
+...
+---
+name: VectorLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_ADD %0, %2
+...
+---
+name: VectorPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10
+ ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9
+ ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9
+ ; CHECK-NEXT: %7:_ KnownBits:0000000????????? SignBits:7
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 42
+ %5:_(s16) = G_CONSTANT i16 74
+ %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4
+ %7:_(<4 x s16>) = G_ADD %6, %3
+...
+---
+name: VectorCst36
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst36
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %4:_ KnownBits:000000000000???? SignBits:12
+ %0:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = G_CONSTANT i16 6
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %4:_(<4 x s16>) = G_ADD %2, %3
+...
+
+---
+name: VectorCst3unknown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst3unknown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_ADD %0, %3
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
index 8552931..ee35447 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -102,8 +102,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_ASHR %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
index 61d1c43..97bcb80 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
@@ -135,8 +135,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_SHL %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
new file mode 100644
index 0000000..332049d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
@@ -0,0 +1,276 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 2
+ %1:_(s8) = G_CONSTANT i8 224
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7
+ ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegFour
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegFour
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 4
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNeg
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNeg
+ ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 224
+ %1:_(s8) = G_CONSTANT i8 2
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 0
+ %4:_(s8) = G_SUB %3, %2
+...
+---
+name: ScalarLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %1, %0
+...
+---
+name: ScalarPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 5
+ %4:_(s8) = G_SUB %2, %3
+...
+---
+name: VectorCstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstZero
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_SUB %1, %2
+...
+---
+name: VectorCstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(s16) = G_CONSTANT i16 1
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+---
+name: VectorVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(<4 x s16>) = COPY $d1
+ %2:_(<4 x s16>) = G_SUB %0, %1
+...
+---
+name: VectorRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %2, %0
+...
+---
+name: VectorNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 0
+ %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4
+ %6:_(<4 x s16>) = G_SUB %5, %3
+...
+---
+name: VectorLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %0, %2
+...
+---
+name: VectorPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10
+ ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9
+ ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9
+ ; CHECK-NEXT: %7:_ KnownBits:???????????????? SignBits:7
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 42
+ %5:_(s16) = G_CONSTANT i16 74
+ %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4
+ %7:_(<4 x s16>) = G_SUB %6, %3
+...
+---
+name: VectorCst36
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst36
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12
+ %0:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = G_CONSTANT i16 6
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+
+---
+name: VectorCst3unknown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst3unknown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_SUB %0, %3
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
index cc75774..c2bf95c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir
@@ -15,8 +15,9 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[C1]](s64)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]]
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY2]], [[C2]]
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64)
; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32))
; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64)
@@ -91,7 +92,8 @@ body: |
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32))
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C3]](s64)
- ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[C3]], [[C2]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[C3]](s64)
+ ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[COPY3]], [[C2]]
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64)
; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD1]](p0) :: (store (s32))
; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C3]](s64)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index 649d0a9..e7e9ee7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,41 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: smmla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: smmla.v4i32.v16i8
-; CHECK: smmla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: ummla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: ummla.v4i32.v16i8
-; CHECK: ummla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usmmla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usmmla.v4i32.v16i8
-; CHECK: usmmla v0.4s, v1.16b, v2.16b
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusmmla1.i
}
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot.v2i32.v8i8
-; CHECK: usdot v0.2s, v1.8b, v2.8b
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
ret <2 x i32> %vusdot1.i
}
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v2i32.v8i8
-; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -44,9 +57,12 @@ entry:
}
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v2i32.v8i8
-; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -55,9 +71,11 @@ entry:
}
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v2i32.v16i8
-; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -66,9 +84,11 @@ entry:
}
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v2i32.v16i8
-; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -77,17 +97,22 @@ entry:
}
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.16b
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusdot1.i
}
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -96,9 +121,12 @@ entry:
}
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v4i32.v16i8
-; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -107,9 +135,11 @@ entry:
}
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_laneq.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -118,9 +148,11 @@ entry:
}
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_laneq.v4i32.v16i8
-; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -133,4 +165,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
-
diff --git a/llvm/test/CodeGen/AArch64/aarch64-post-coalescer.mir b/llvm/test/CodeGen/AArch64/aarch64-post-coalescer.mir
new file mode 100644
index 0000000..6540160
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-post-coalescer.mir
@@ -0,0 +1,16 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -mattr=+sme -run-pass=aarch64-post-coalescer-pass -o - %s | FileCheck %s
+
+---
+name: foo
+machineFunctionInfo:
+ hasStreamingModeChanges: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: foo
+ ; CHECK: $d0 = COPY undef %0:fpr64
+ ; CHECK-NEXT: FAKE_USE implicit $d0
+ %1:fpr64 = COALESCER_BARRIER_FPR64 undef %1
+ $d0 = COPY %1
+ FAKE_USE implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll
index aa070b7..9b456a5 100644
--- a/llvm/test/CodeGen/AArch64/adds_cmn.ll
+++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll
@@ -22,10 +22,8 @@ entry:
define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) {
; CHECK-LABEL: adds_cmn_c:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmn w0, w1
-; CHECK-NEXT: add w1, w1, w0
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: adds w1, w0, w1
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
entry:
%0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
index a0f1b71..bb362d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-fpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines
-define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) {
+define void @zero_cycle_regmove_FPR64(double %a, double %b, double %c, double %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov d0, d2
@@ -45,7 +45,7 @@ entry:
declare float @foo_double(double, double)
-define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+define void @zero_cycle_regmove_FPR32(float %a, float %b, float %c, float %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
@@ -86,7 +86,7 @@ entry:
declare float @foo_float(float, float)
-define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+define void @zero_cycle_regmove_FPR16(half %a, half %b, half %c, half %d) {
entry:
; CHECK-LABEL: t:
; NOZCM-FPR128-CPU: fmov s0, s2
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
index e14e69b..d6d3f15 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-gpr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmove-gpr.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
-define void @zero_cycle_regmov_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
+define void @zero_cycle_regmove_GPR32(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
; CHECK-LABEL: t:
; NOTCPU-LINUX: mov w0, w2
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index dc88f94..cca190f 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1774,3 +1774,88 @@ define i128 @combine_i128_sdiv_const100(i128 %x) {
%1 = sdiv i128 %x, 100
ret i128 %1
}
+
+; The following only becomes an sdiv_by_one after type legalisation, after which
+; the splatted scalar constant has a different type to the splat vector. This
+; test verifies DAGCombiner does not care about this type difference.
+define <16 x i16> @combine_vec_sdiv_by_one_obfuscated(<16 x i16> %x) "target-features"="+sve" {
+; CHECK-SD-LABEL: combine_vec_sdiv_by_one_obfuscated:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: combine_vec_sdiv_by_one_obfuscated:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v3.8h, #1
+; CHECK-GI-NEXT: smov w8, v0.h[0]
+; CHECK-GI-NEXT: mov v3.h[0], v2.h[0]
+; CHECK-GI-NEXT: smov w9, v3.h[0]
+; CHECK-GI-NEXT: smov w16, v3.h[7]
+; CHECK-GI-NEXT: sdiv w14, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[1]
+; CHECK-GI-NEXT: smov w9, v3.h[1]
+; CHECK-GI-NEXT: sdiv w15, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[2]
+; CHECK-GI-NEXT: smov w9, v3.h[2]
+; CHECK-GI-NEXT: sdiv w13, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[3]
+; CHECK-GI-NEXT: smov w9, v3.h[3]
+; CHECK-GI-NEXT: sdiv w12, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[4]
+; CHECK-GI-NEXT: smov w9, v3.h[4]
+; CHECK-GI-NEXT: sdiv w11, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[5]
+; CHECK-GI-NEXT: smov w9, v3.h[5]
+; CHECK-GI-NEXT: sdiv w10, w8, w9
+; CHECK-GI-NEXT: smov w8, v0.h[6]
+; CHECK-GI-NEXT: smov w9, v3.h[6]
+; CHECK-GI-NEXT: movi v3.8h, #1
+; CHECK-GI-NEXT: smov w17, v3.h[0]
+; CHECK-GI-NEXT: smov w18, v3.h[1]
+; CHECK-GI-NEXT: smov w0, v3.h[2]
+; CHECK-GI-NEXT: smov w1, v3.h[3]
+; CHECK-GI-NEXT: smov w2, v3.h[4]
+; CHECK-GI-NEXT: smov w3, v3.h[5]
+; CHECK-GI-NEXT: sdiv w8, w8, w9
+; CHECK-GI-NEXT: smov w9, v0.h[7]
+; CHECK-GI-NEXT: fmov s0, w14
+; CHECK-GI-NEXT: mov v0.h[1], w15
+; CHECK-GI-NEXT: smov w15, v1.h[6]
+; CHECK-GI-NEXT: mov v0.h[2], w13
+; CHECK-GI-NEXT: sdiv w9, w9, w16
+; CHECK-GI-NEXT: smov w16, v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], w12
+; CHECK-GI-NEXT: smov w12, v1.h[7]
+; CHECK-GI-NEXT: mov v0.h[4], w11
+; CHECK-GI-NEXT: sdiv w16, w16, w17
+; CHECK-GI-NEXT: smov w17, v1.h[1]
+; CHECK-GI-NEXT: mov v0.h[5], w10
+; CHECK-GI-NEXT: mov v0.h[6], w8
+; CHECK-GI-NEXT: sdiv w17, w17, w18
+; CHECK-GI-NEXT: smov w18, v1.h[2]
+; CHECK-GI-NEXT: fmov s2, w16
+; CHECK-GI-NEXT: smov w16, v3.h[6]
+; CHECK-GI-NEXT: mov v0.h[7], w9
+; CHECK-GI-NEXT: sdiv w18, w18, w0
+; CHECK-GI-NEXT: smov w0, v1.h[3]
+; CHECK-GI-NEXT: mov v2.h[1], w17
+; CHECK-GI-NEXT: sdiv w0, w0, w1
+; CHECK-GI-NEXT: smov w1, v1.h[4]
+; CHECK-GI-NEXT: mov v2.h[2], w18
+; CHECK-GI-NEXT: sdiv w1, w1, w2
+; CHECK-GI-NEXT: smov w2, v1.h[5]
+; CHECK-GI-NEXT: mov v2.h[3], w0
+; CHECK-GI-NEXT: sdiv w14, w2, w3
+; CHECK-GI-NEXT: mov v2.h[4], w1
+; CHECK-GI-NEXT: sdiv w13, w15, w16
+; CHECK-GI-NEXT: smov w15, v3.h[7]
+; CHECK-GI-NEXT: mov v2.h[5], w14
+; CHECK-GI-NEXT: sdiv w10, w12, w15
+; CHECK-GI-NEXT: mov v2.h[6], w13
+; CHECK-GI-NEXT: mov v2.h[7], w10
+; CHECK-GI-NEXT: mov v1.16b, v2.16b
+; CHECK-GI-NEXT: ret
+ %zero_and_ones = shufflevector <16 x i16> zeroinitializer, <16 x i16> splat (i16 1), <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %div = sdiv <16 x i16> %x, %zero_and_ones
+ ret <16 x i16> %div
+}
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
index 5933c5d..b8302e6 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
@@ -380,10 +380,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0
; CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 1 :: (load (s16) from %stack.3)
@@ -430,10 +428,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
@@ -557,10 +553,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16
- ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
- ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
+ ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
+ ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20)
@@ -745,10 +739,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
; CHECK-NEXT: frame-destroy SEH_SetFP
- ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
- ; CHECK-NEXT: frame-destroy SEH_SaveFPLR 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
+ ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
@@ -869,10 +861,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg
; CHECK-NEXT: frame-destroy SEH_AllocZ 7
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1
; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7)
diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
new file mode 100644
index 0000000..3f174a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
@@ -0,0 +1,227 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme -run-pass=aarch64-machine-sme-abi -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+ ; Test moving a state change to be before a $nzcv def
+ define void @move_before_nzcv_def() "aarch64_inout_za" { ret void }
+
+ ; Test moving a state change to a point where $x0 is live
+ define void @move_to_x0_live() "aarch64_inout_za" { ret void }
+
+ ; Test we don't move before a previous state change.
+ define void @do_not_move_before_prior_state_change() "aarch64_za_state_agnostic" { ret void }
+
+ ; Test we don't move into a call sequence.
+ define void @do_not_move_into_call() "aarch64_inout_za" { ret void }
+
+ declare void @clobber()
+ declare void @inout_call() "aarch64_inout_za"
+...
+---
+name: move_before_nzcv_def
+tracksRegLiveness: true
+isSSA: true
+noVRegs: false
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: move_before_nzcv_def
+ ; CHECK: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp
+ ; CHECK-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]]
+ ; CHECK-NEXT: $sp = COPY [[MSUBXrrr]]
+ ; CHECK-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[ADDXri]]
+ ; CHECK-NEXT: MSR 56965, [[COPY1]]
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: RequiresZASavePseudo
+ ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
+ ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
+ ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
+ ; CHECK-NEXT: MSR 56965, $xzr
+ ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+ ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
+ ; CHECK-NEXT: FAKE_USE $nzcv
+ ; CHECK-NEXT: RET_ReallyLR
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ RequiresZASavePseudo
+ BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+ $nzcv = IMPLICIT_DEF
+ $zab0 = IMPLICIT_DEF
+ FAKE_USE $nzcv
+
+ RET_ReallyLR
+...
+---
+name: move_to_x0_live
+tracksRegLiveness: true
+isSSA: true
+noVRegs: false
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: move_to_x0_live
+ ; CHECK: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp
+ ; CHECK-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]]
+ ; CHECK-NEXT: $sp = COPY [[MSUBXrrr]]
+ ; CHECK-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[ADDXri]]
+ ; CHECK-NEXT: MSR 56965, [[COPY1]]
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: RequiresZASavePseudo
+ ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: $x0 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
+ ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
+ ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
+ ; CHECK-NEXT: MSR 56965, $xzr
+ ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+ ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+ ; CHECK-NEXT: FAKE_USE $x0
+ ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
+ ; CHECK-NEXT: FAKE_USE $nzcv
+ ; CHECK-NEXT: RET_ReallyLR
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ RequiresZASavePseudo
+ BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+ $x0 = IMPLICIT_DEF
+
+ $nzcv = IMPLICIT_DEF
+ FAKE_USE $x0
+
+ $zab0 = IMPLICIT_DEF
+ FAKE_USE $nzcv
+
+ RET_ReallyLR
+...
+---
+name: do_not_move_before_prior_state_change
+tracksRegLiveness: true
+isSSA: true
+noVRegs: false
+body: |
+ ; CHECK-LABEL: name: do_not_move_before_prior_state_change
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BL &__arm_sme_state_size, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit-def $x0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: $sp = SUBXrx64 $sp, [[COPY]], 24
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $sp
+ ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+ ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+ ; CHECK-NEXT: $x0 = COPY [[COPY1]]
+ ; CHECK-NEXT: BL &__arm_sme_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0
+ ; CHECK-NEXT: MSR 55824, [[MRS]], implicit-def $nzcv
+ ; CHECK-NEXT: Bcc 2, %bb.1, implicit $nzcv
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $nzcv
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: FAKE_USE $nzcv
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: RequiresZASavePseudo
+ ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: $x0 = COPY [[COPY1]]
+ ; CHECK-NEXT: BL &__arm_sme_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0
+ ; CHECK-NEXT: RET_ReallyLR
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: RequiresZASavePseudo
+ ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: $x0 = COPY [[COPY1]]
+ ; CHECK-NEXT: BL &__arm_sme_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0
+ ; CHECK-NEXT: RET_ReallyLR
+ bb.0:
+ successors: %bb.1, %bb.2
+
+ ; The insertion point can move before the $nzcv def (as that would require
+ ; moving before a $zab0 def -- that requires the ACTIVE state).
+ $nzcv = IMPLICIT_DEF
+ $zab0 = IMPLICIT_DEF
+ Bcc 2, %bb.1, implicit $nzcv
+ B %bb.2
+ ; bb.1 and bb.2 both require ZA saved on entry (to force bb.0's exit bundle to
+ ; pick the LOCAL_SAVED state).
+ bb.1:
+ liveins: $nzcv
+ FAKE_USE $nzcv
+
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ RequiresZASavePseudo
+ BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+ RET_ReallyLR
+ bb.2:
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ RequiresZASavePseudo
+ BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+ RET_ReallyLR
+...
+---
+name: do_not_move_into_call
+tracksRegLiveness: true
+isSSA: true
+noVRegs: false
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: do_not_move_into_call
+ ; CHECK: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $sp
+ ; CHECK-NEXT: [[MSUBXrrr:%[0-9]+]]:gpr64 = MSUBXrrr [[RDSVLI_XI]], [[RDSVLI_XI]], [[COPY]]
+ ; CHECK-NEXT: $sp = COPY [[MSUBXrrr]]
+ ; CHECK-NEXT: STPXi [[MSUBXrrr]], [[RDSVLI_XI]], %stack.0, 0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY [[ADDXri]]
+ ; CHECK-NEXT: MSR 56965, [[COPY1]]
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: RequiresZASavePseudo
+ ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+ ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+ ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
+ ; CHECK-NEXT: [[MRS1:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
+ ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: RestoreZAPseudo [[MRS1]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
+ ; CHECK-NEXT: MSR 56965, $xzr
+ ; CHECK-NEXT: MSR 55824, [[MRS]], implicit-def $nzcv
+ ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
+ ; CHECK-NEXT: FAKE_USE $nzcv
+ ; CHECK-NEXT: RET_ReallyLR
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ RequiresZASavePseudo
+ BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+
+ ; This is artificial test where NZCV is def'd inside a call, so we can't
+ ; move the insert point before it's definition.
+ $nzcv = IMPLICIT_DEF
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+
+ $zab0 = IMPLICIT_DEF
+ FAKE_USE $nzcv
+
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/mir-yaml-has-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/mir-yaml-has-streaming-mode-changes.ll
new file mode 100644
index 0000000..8f1fe5c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mir-yaml-has-streaming-mode-changes.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=aarch64 -mattr=+sme -stop-after=aarch64-isel < %s | FileCheck %s
+
+target triple = "aarch64"
+
+declare void @foo() "aarch64_pstate_sm_enabled"
+
+define dso_local void @bar() local_unnamed_addr {
+; CHECK-LABEL: name: bar
+; CHECK: hasStreamingModeChanges: true
+entry:
+ tail call void @foo() "aarch64_pstate_sm_enabled"
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index ecd48d6..149b4c4 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w1
-; CHECK-NEXT: cmn w1, w0
+; CHECK-NEXT: adds w8, w1, w0
; CHECK-NEXT: csinv w0, w8, wzr, lo
; CHECK-NEXT: ret
%noty = xor i32 %y, -1
@@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) {
define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmn x1, x0
+; CHECK-NEXT: adds x8, x1, x0
; CHECK-NEXT: csinv x0, x8, xzr, lo
; CHECK-NEXT: ret
%noty = xor i64 %y, -1
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index e3007a3..e4f9efa 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -391,11 +391,9 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0
; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-NEWLOWERING-NEXT: cmp sp, x19
; CHECK-NEWLOWERING-NEXT: mov x0, x19
-; CHECK-NEWLOWERING-NEXT: mrs x8, NZCV
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT: msr NZCV, x8
+; CHECK-NEWLOWERING-NEXT: cmp sp, x19
; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
; CHECK-NEWLOWERING-NEXT: mov x0, x19
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir b/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir
index 18764d5..9f33c06 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-sve-nzcv-live.mir
@@ -62,14 +62,12 @@ body: |
; CHECK-NEXT: RequiresZASavePseudo
; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY1]], 101, 0, implicit-def $nzcv
- ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 55824, implicit-def $nzcv, implicit $nzcv
; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
; CHECK-NEXT: [[MRS1:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
; CHECK-NEXT: RestoreZAPseudo [[MRS1]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
; CHECK-NEXT: MSR 56965, $xzr
- ; CHECK-NEXT: MSR 55824, [[MRS]], implicit-def $nzcv
+ ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY1]], 101, 0, implicit-def $nzcv
; CHECK-NEXT: Bcc 11, %bb.2, implicit $nzcv
; CHECK-NEXT: B %bb.1
; CHECK-NEXT: {{ $}}
@@ -116,16 +114,14 @@ body: |
# CHECK-ASM-LABEL: cmp_branch
# CHECK-ASM: msr TPIDR2_EL0, x10
# CHECK-ASM-NEXT: bl clobber
-# CHECK-ASM-NEXT: cmp w20, #101
-# CHECK-ASM-NEXT: mrs x8, NZCV
# CHECK-ASM-NEXT: smstart za
-# CHECK-ASM-NEXT: mrs x9, TPIDR2_EL0
+# CHECK-ASM-NEXT: mrs x8, TPIDR2_EL0
# CHECK-ASM-NEXT: sub x0, x29, #16
-# CHECK-ASM-NEXT: cbnz x9, .LBB0_2
+# CHECK-ASM-NEXT: cbnz x8, .LBB0_2
# CHECK-ASM: bl __arm_tpidr2_restore
# CHECK-ASM-NEXT: .LBB0_2:
+# CHECK-ASM-NEXT: cmp w20, #101
# CHECK-ASM-NEXT: msr TPIDR2_EL0, xzr
-# CHECK-ASM-NEXT: msr NZCV, x8
# CHECK-ASM-NEXT: b.lt .LBB0_4
# CHECK-ASM: bl inout_call
# CHECK-ASM-NEXT: .LBB0_4:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b6dee97e..b8d6c88 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -732,6 +732,247 @@ exit:
ret void
}
+; This example corresponds to:
+;
+; __arm_agnostic("sme_za_state") void try_catch_agnostic_za_invoke()
+; {
+; try {
+; agnostic_za_call();
+; } catch(...) {
+; }
+; }
+;
+; In this example we preserve all SME state enabled by PSTATE.ZA using
+; `__arm_sme_save` before agnostic_za_call(). This is because on all normal
+; returns from an agnostic ZA function ZA state should be preserved. That means
+; we need to make sure ZA state is saved in case agnostic_za_call() throws, and
+; we need to restore ZA state after unwinding to the catch block.
+
+define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_agnostic_za_invoke:
+; CHECK: .Lfunc_begin5:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception5
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: sub sp, sp, x0
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .Ltmp15: // EH_LABEL
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: bl agnostic_za_call
+; CHECK-NEXT: .Ltmp16: // EH_LABEL
+; CHECK-NEXT: .LBB5_1: // %exit
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB5_2: // %catch
+; CHECK-NEXT: .Ltmp17: // EH_LABEL
+; CHECK-NEXT: bl __cxa_begin_catch
+; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: b .LBB5_1
+;
+; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke:
+; CHECK-SDAG: .Lfunc_begin5:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception5
+; CHECK-SDAG-NEXT: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: .Ltmp15: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: bl agnostic_za_call
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: .Ltmp16: // EH_LABEL
+; CHECK-SDAG-NEXT: .LBB5_1: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB5_2: // %catch
+; CHECK-SDAG-NEXT: .Ltmp17: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: bl __cxa_begin_catch
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: bl __cxa_end_catch
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: b .LBB5_1
+entry:
+ invoke void @agnostic_za_call()
+ to label %exit unwind label %catch
+
+catch:
+ %eh_info = landingpad { ptr, i32 }
+ catch ptr null
+ %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+ tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+ tail call void @__cxa_end_catch()
+ br label %exit
+
+exit:
+ ret void
+}
+
+; This is the same `try_catch_agnostic_za_invoke`, but shows a lazy save would
+; also need to be committed in a shared-ZA function calling an agnostic-ZA function.
+define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_inout_za_agnostic_za_callee:
+; CHECK: .Lfunc_begin6:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception6
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEXT: .Ltmp18: // EH_LABEL
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl agnostic_za_call
+; CHECK-NEXT: .Ltmp19: // EH_LABEL
+; CHECK-NEXT: .LBB6_1: // %exit
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB6_3
+; CHECK-NEXT: // %bb.2: // %exit
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB6_3: // %exit
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB6_4: // %catch
+; CHECK-NEXT: .Ltmp20: // EH_LABEL
+; CHECK-NEXT: bl __cxa_begin_catch
+; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: b .LBB6_1
+;
+; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee:
+; CHECK-SDAG: .Lfunc_begin6:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception6
+; CHECK-SDAG-NEXT: // %bb.0: // %entry
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: .Ltmp18: // EH_LABEL
+; CHECK-SDAG-NEXT: sub x19, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl agnostic_za_call
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB6_2
+; CHECK-SDAG-NEXT: // %bb.1: // %entry
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB6_2: // %entry
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .Ltmp19: // EH_LABEL
+; CHECK-SDAG-NEXT: .LBB6_3: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB6_4: // %catch
+; CHECK-SDAG-NEXT: .Ltmp20: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB6_6
+; CHECK-SDAG-NEXT: // %bb.5: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB6_6: // %catch
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_begin_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB6_8
+; CHECK-SDAG-NEXT: // %bb.7: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB6_8: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_end_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB6_10
+; CHECK-SDAG-NEXT: // %bb.9: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB6_10: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b .LBB6_3
+entry:
+ invoke void @agnostic_za_call()
+ to label %exit unwind label %catch
+
+catch:
+ %eh_info = landingpad { ptr, i32 }
+ catch ptr null
+ %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+ tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+ tail call void @__cxa_end_catch()
+ br label %exit
+
+exit:
+ ret void
+}
+
declare ptr @__cxa_allocate_exception(i64)
declare void @__cxa_throw(ptr, ptr, ptr)
declare ptr @__cxa_begin_catch(ptr)
@@ -742,3 +983,4 @@ declare void @may_throw()
declare void @shared_za_call() "aarch64_inout_za"
declare void @noexcept_shared_za_call() "aarch64_inout_za"
declare void @shared_zt0_call() "aarch64_inout_zt0"
+declare void @agnostic_za_call() "aarch64_za_state_agnostic"
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 15ee6a0..36655f6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
ret float %r
}
+; No FMULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
+; CHECK-LABEL: fmulv_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
+; CHECK-LABEL: fmulv_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
+ ret half %res
+}
+
+define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
+; CHECK-LABEL: fmulv_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.h, #1.00000000
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: fmul z1.h, z1.h, z3.h
+; CHECK-NEXT: fmul h0, h0, h1
+; CHECK-NEXT: ret
+ %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
+ ret half %res
+}
+
+define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
+; CHECK-LABEL: fmulv_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
+ ret float %res
+}
+
+define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
+; CHECK-LABEL: fmulv_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.s, #1.00000000
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT: fmul z1.s, z1.s, z3.s
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
+ ret float %res
+}
+
+define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fmulv_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov z2.d, #1.00000000
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT: fmul z1.d, z1.d, z3.d
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
+ %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
+ ret double %res
+}
+
declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
-declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
-declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
-declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)
+
+declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
+declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
+declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
+declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index be936f0..6fb0315 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}
+; No MULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: mulv_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, #1 // =0x1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
+ ret i8 %res
+}
+
+define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: mulv_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, #1 // =0x1
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
+ ret i16 %res
+}
+
+define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: mulv_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
+ ret i32 %res
+}
+
+define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: mulv_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
+ ret i64 %res
+}
+
; Test widen vector reduce type
declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 53ac934..3ba4a1c 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: stur x0, [x8, #16]
; CHECK-NEXT: addvl x8, x29, #18
; CHECK-NEXT: ldr x1, [x8, #32]
-; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .Ltmp0: // EH_LABEL
; CHECK-NEXT: add x0, x19, #0
; CHECK-NEXT: bl g6
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp1: // EH_LABEL
; CHECK-NEXT: // %bb.1: // %invoke.cont
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: add sp, sp, #64
@@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_preg p14, 10
; CHECK-NEXT: ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: .seh_save_preg p15, 11
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .seh_allocz 18
; CHECK-NEXT: add sp, sp, #16
@@ -1024,10 +1010,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: .seh_stackalloc 64
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: add sp, sp, #64
@@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1594,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
store i32 %i, ptr %a
ret void
}
+
+declare ptr @llvm.swift.async.context.addr()
+
+define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) {
+; CHECK-LABEL: f16:
+; CHECK: .seh_proc f16
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: orr x29, x29, #0x1000000000000000
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: str x22, [sp]
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: add x29, sp, #8
+; CHECK-NEXT: .seh_add_fp 8
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr x8, [x22]
+; CHECK-NEXT: stur x8, [x29, #-8]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: and x29, x29, #0xefffffffffffffff
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+ tail call void asm sideeffect "", "~{z8}"()
+ %1 = load ptr, ptr %ctx, align 8
+ %2 = tail call ptr @llvm.swift.async.context.addr()
+ store ptr %1, ptr %2, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 26b9d99..8705647 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -206,7 +206,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
; global nnan function attribute always forces clamp combine
-define float @test_min_max_global_nnan(float %a) #3 {
+define float @test_min_max_global_nnan(float %a) {
; GFX10-LABEL: test_min_max_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -223,11 +223,11 @@ define float @test_min_max_global_nnan(float %a) #3 {
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
- %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
+ %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 1.0)
ret float %fmed
}
-define float @test_max_min_global_nnan(float %a) #3 {
+define float @test_max_min_global_nnan(float %a) {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -244,7 +244,7 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 1.0)
- %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
+ %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 0.0)
ret float %fmed
}
@@ -414,5 +414,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
attributes #0 = {"amdgpu-ieee"="true"}
attributes #1 = {"amdgpu-ieee"="false"}
attributes #2 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="true"}
-attributes #3 = {"no-nans-fp-math"="true"}
attributes #4 = {"amdgpu-ieee"="true" "amdgpu-dx10-clamp"="false"}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index d2c93e7..696a87b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -232,7 +232,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
; global nnan function attribute always forces fmed3 combine
-define float @test_min_max_global_nnan(float %a) #2 {
+define float @test_min_max_global_nnan(float %a) {
; GFX10-LABEL: test_min_max_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -254,12 +254,12 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
ret float %fmed
}
-define float @test_max_min_global_nnan(float %a) #2 {
+define float @test_max_min_global_nnan(float %a) {
; GFX10-LABEL: test_max_min_global_nnan:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -281,8 +281,8 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
- %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
+ %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
+ %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
ret float %fmed
}
@@ -560,4 +560,3 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
attributes #0 = {"amdgpu-ieee"="true"}
attributes #1 = {"amdgpu-ieee"="false"}
-attributes #2 = {"no-nans-fp-math"="true"}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 549af87..a43bfb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -1047,7 +1047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB9_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else20
; CI-NEXT: s_and_b32 s2, s0, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -1058,7 +1058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1083,10 +1083,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: .LBB9_5: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1102,7 +1102,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_7
; CI-NEXT: .LBB9_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1125,7 +1125,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB9_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_and_b32 s4, s2, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -1136,7 +1136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s4, s4, 1
; CI-NEXT: s_cmp_lg_u32 s4, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1161,10 +1161,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: .LBB9_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1180,7 +1180,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_15
; CI-NEXT: .LBB9_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1237,7 +1237,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; VI-NEXT: s_cbranch_vccz .LBB9_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else20
; VI-NEXT: s_and_b32 s2, s0, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1248,7 +1248,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute19
; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; VI-NEXT: v_ldexp_f32 v1, v3, 1
@@ -1273,10 +1273,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: .LBB9_5: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_7
; VI-NEXT: .LBB9_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1315,7 +1315,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; VI-NEXT: s_cbranch_vccz .LBB9_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_and_b32 s3, s4, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1326,7 +1326,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s3, s3, 1
; VI-NEXT: s_cmp_lg_u32 s3, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_16
-; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; VI-NEXT: v_ldexp_f32 v2, v4, 1
@@ -1351,10 +1351,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: .LBB9_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1370,7 +1370,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_15
; VI-NEXT: .LBB9_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB10_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else86
; CI-NEXT: s_and_b32 s0, s4, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1436,7 +1436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s0, s0, 1
; CI-NEXT: s_cmp_lg_u32 s0, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute85
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1461,10 +1461,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: .LBB10_5: ; %frem.loop_body93
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1480,7 +1480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_7
; CI-NEXT: .LBB10_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit94
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1503,7 +1503,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB10_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else53
; CI-NEXT: s_and_b32 s1, s6, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute52
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1539,10 +1539,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: .LBB10_13: ; %frem.loop_body60
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1558,7 +1558,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_15
; CI-NEXT: .LBB10_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit61
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1579,7 +1579,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr2
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; CI-NEXT: s_cbranch_vccz .LBB10_18
-; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: ; %bb.17: ; %frem.else20
; CI-NEXT: s_and_b32 s1, s5, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
; CI-NEXT: v_mov_b32_e32 v2, s1
@@ -1590,7 +1590,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_24
-; CI-NEXT: ; %bb.19: ; %frem.compute52
+; CI-NEXT: ; %bb.19: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v5, v3
; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1
@@ -1615,10 +1615,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_22
-; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7
; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
-; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: .LBB10_21: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v6
; CI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -1634,7 +1634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_23
; CI-NEXT: .LBB10_22:
; CI-NEXT: v_mov_b32_e32 v7, v6
-; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4
; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
; CI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr3
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
; CI-NEXT: s_cbranch_vccz .LBB10_26
-; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_and_b32 s1, s7, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1668,7 +1668,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_32
-; CI-NEXT: ; %bb.27: ; %frem.compute85
+; CI-NEXT: ; %bb.27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e32 v6, v4
; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1
@@ -1693,10 +1693,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_30
-; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8
; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
-; CI-NEXT: .LBB10_29: ; %frem.loop_body93
+; CI-NEXT: .LBB10_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v8, v7
; CI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -1712,7 +1712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_31
; CI-NEXT: .LBB10_30:
; CI-NEXT: v_mov_b32_e32 v8, v7
-; CI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5
; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
; CI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -1791,7 +1791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; VI-NEXT: s_cbranch_vccz .LBB10_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else86
; VI-NEXT: s_and_b32 s0, s8, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1802,7 +1802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute85
; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; VI-NEXT: v_ldexp_f32 v1, v3, 1
@@ -1827,10 +1827,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: .LBB10_5: ; %frem.loop_body93
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1846,7 +1846,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_7
; VI-NEXT: .LBB10_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit94
; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; VI-NEXT: s_cbranch_vccz .LBB10_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else53
; VI-NEXT: s_and_b32 s0, s4, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; VI-NEXT: v_mov_b32_e32 v1, s0
@@ -1880,7 +1880,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_16
-; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: ; %bb.11: ; %frem.compute52
; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; VI-NEXT: v_ldexp_f32 v2, v4, 1
@@ -1905,10 +1905,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: .LBB10_13: ; %frem.loop_body60
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1924,7 +1924,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_15
; VI-NEXT: .LBB10_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit61
; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1945,7 +1945,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr2
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; VI-NEXT: s_cbranch_vccz .LBB10_18
-; VI-NEXT: ; %bb.17: ; %frem.else53
+; VI-NEXT: ; %bb.17: ; %frem.else20
; VI-NEXT: s_and_b32 s0, s9, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1956,7 +1956,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_24
-; VI-NEXT: ; %bb.19: ; %frem.compute52
+; VI-NEXT: ; %bb.19: ; %frem.compute19
; VI-NEXT: v_frexp_mant_f32_e32 v5, v3
; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
; VI-NEXT: v_ldexp_f32 v3, v5, 1
@@ -1981,10 +1981,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_22
-; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
-; VI-NEXT: .LBB10_21: ; %frem.loop_body60
+; VI-NEXT: .LBB10_21: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -2000,7 +2000,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_23
; VI-NEXT: .LBB10_22:
; VI-NEXT: v_mov_b32_e32 v7, v6
-; VI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
; VI-NEXT: v_ldexp_f32 v4, v7, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -2023,7 +2023,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
; VI-NEXT: s_cbranch_vccz .LBB10_26
-; VI-NEXT: ; %bb.25: ; %frem.else86
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_and_b32 s0, s12, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
; VI-NEXT: v_mov_b32_e32 v3, s0
@@ -2034,7 +2034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_32
-; VI-NEXT: ; %bb.27: ; %frem.compute85
+; VI-NEXT: ; %bb.27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e32 v6, v4
; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
; VI-NEXT: v_ldexp_f32 v4, v6, 1
@@ -2059,10 +2059,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_30
-; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8
; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
-; VI-NEXT: .LBB10_29: ; %frem.loop_body93
+; VI-NEXT: .LBB10_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -2078,7 +2078,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_31
; VI-NEXT: .LBB10_30:
; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5
; VI-NEXT: v_ldexp_f32 v5, v8, v5
; VI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -2144,7 +2144,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_cbranch_vccz .LBB11_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2156,7 +2156,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s6, s6, 1
; CI-NEXT: s_cmp_lg_u32 s6, 0
; CI-NEXT: s_cbranch_scc1 .LBB11_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
@@ -2181,10 +2181,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: .LBB11_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2200,7 +2200,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_7
; CI-NEXT: .LBB11_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2219,7 +2219,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_cbranch_vccz .LBB11_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2231,7 +2231,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s6, s6, 1
; CI-NEXT: s_cmp_lg_u32 s6, 0
; CI-NEXT: s_cbranch_scc1 .LBB11_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
@@ -2256,10 +2256,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: .LBB11_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2275,7 +2275,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_15
; CI-NEXT: .LBB11_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2317,7 +2317,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_cbranch_vccz .LBB11_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2329,7 +2329,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s6, s6, 1
; VI-NEXT: s_cmp_lg_u32 s6, 0
; VI-NEXT: s_cbranch_scc1 .LBB11_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; VI-NEXT: v_ldexp_f32 v1, v1, 1
; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
@@ -2354,10 +2354,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: .LBB11_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2373,7 +2373,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_7
; VI-NEXT: .LBB11_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2392,7 +2392,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_cbranch_vccz .LBB11_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2404,7 +2404,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s6, s6, 1
; VI-NEXT: s_cmp_lg_u32 s6, 0
; VI-NEXT: s_cbranch_scc1 .LBB11_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
; VI-NEXT: v_ldexp_f32 v2, v2, 1
; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
@@ -2429,10 +2429,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: .LBB11_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2448,7 +2448,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_15
; VI-NEXT: .LBB11_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2498,7 +2498,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_cbranch_vccz .LBB12_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else78
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: v_mov_b32_e32 v0, s4
@@ -2510,7 +2510,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute77
; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
@@ -2535,10 +2535,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: .LBB12_5: ; %frem.loop_body85
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2554,7 +2554,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_7
; CI-NEXT: .LBB12_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit86
; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2573,7 +2573,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_cbranch_vccz .LBB12_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else47
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
; CI-NEXT: v_mov_b32_e32 v2, s9
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2585,7 +2585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute46
; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
@@ -2610,10 +2610,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: .LBB12_13: ; %frem.loop_body54
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2629,7 +2629,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_15
; CI-NEXT: .LBB12_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit55
; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2648,7 +2648,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr2
; CI-NEXT: s_cbranch_vccz .LBB12_18
-; CI-NEXT: ; %bb.17: ; %frem.else47
+; CI-NEXT: ; %bb.17: ; %frem.else16
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
; CI-NEXT: v_mov_b32_e32 v3, s10
; CI-NEXT: v_mov_b32_e32 v2, s6
@@ -2660,7 +2660,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_24
-; CI-NEXT: ; %bb.19: ; %frem.compute46
+; CI-NEXT: ; %bb.19: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1
; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
@@ -2685,10 +2685,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_22
-; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7
; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
-; CI-NEXT: .LBB12_21: ; %frem.loop_body54
+; CI-NEXT: .LBB12_21: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v6
; CI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -2704,7 +2704,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_23
; CI-NEXT: .LBB12_22:
; CI-NEXT: v_mov_b32_e32 v7, v6
-; CI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
; CI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -2723,7 +2723,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr3
; CI-NEXT: s_cbranch_vccz .LBB12_26
-; CI-NEXT: ; %bb.25: ; %frem.else78
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
; CI-NEXT: v_mov_b32_e32 v4, s11
; CI-NEXT: v_mov_b32_e32 v3, s7
@@ -2735,7 +2735,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_32
-; CI-NEXT: ; %bb.27: ; %frem.compute77
+; CI-NEXT: ; %bb.27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1
; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
@@ -2760,10 +2760,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_30
-; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8
; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
-; CI-NEXT: .LBB12_29: ; %frem.loop_body85
+; CI-NEXT: .LBB12_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v8, v7
; CI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -2779,7 +2779,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_31
; CI-NEXT: .LBB12_30:
; CI-NEXT: v_mov_b32_e32 v8, v7
-; CI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
; CI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -2829,7 +2829,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_cbranch_vccz .LBB12_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else78
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -2841,7 +2841,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute77
; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
; VI-NEXT: v_ldexp_f32 v1, v1, 1
; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
@@ -2866,10 +2866,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: .LBB12_5: ; %frem.loop_body85
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2885,7 +2885,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_7
; VI-NEXT: .LBB12_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit86
; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2904,7 +2904,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_cbranch_vccz .LBB12_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else47
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2916,7 +2916,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute46
; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; VI-NEXT: v_ldexp_f32 v2, v2, 1
; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
@@ -2941,10 +2941,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: .LBB12_13: ; %frem.loop_body54
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2960,7 +2960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_15
; VI-NEXT: .LBB12_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit55
; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2979,7 +2979,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr2
; VI-NEXT: s_cbranch_vccz .LBB12_18
-; VI-NEXT: ; %bb.17: ; %frem.else47
+; VI-NEXT: ; %bb.17: ; %frem.else16
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
; VI-NEXT: v_mov_b32_e32 v3, s10
; VI-NEXT: v_mov_b32_e32 v2, s6
@@ -2991,7 +2991,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_24
-; VI-NEXT: ; %bb.19: ; %frem.compute46
+; VI-NEXT: ; %bb.19: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
; VI-NEXT: v_ldexp_f32 v3, v3, 1
; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
@@ -3016,10 +3016,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_22
-; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
-; VI-NEXT: .LBB12_21: ; %frem.loop_body54
+; VI-NEXT: .LBB12_21: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -3035,7 +3035,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_23
; VI-NEXT: .LBB12_22:
; VI-NEXT: v_mov_b32_e32 v7, v6
-; VI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
; VI-NEXT: v_ldexp_f32 v4, v7, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -3054,7 +3054,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: s_cbranch_vccz .LBB12_26
-; VI-NEXT: ; %bb.25: ; %frem.else78
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
; VI-NEXT: v_mov_b32_e32 v4, s11
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -3066,7 +3066,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_32
-; VI-NEXT: ; %bb.27: ; %frem.compute77
+; VI-NEXT: ; %bb.27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
; VI-NEXT: v_ldexp_f32 v4, v4, 1
; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
@@ -3091,10 +3091,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_30
-; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8
; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
-; VI-NEXT: .LBB12_29: ; %frem.loop_body85
+; VI-NEXT: .LBB12_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -3110,7 +3110,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_31
; VI-NEXT: .LBB12_30:
; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
; VI-NEXT: v_ldexp_f32 v5, v8, v5
; VI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -3169,7 +3169,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
; CI-NEXT: s_cbranch_vccz .LBB13_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
@@ -3187,7 +3187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB13_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
@@ -3210,10 +3210,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
-; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: .LBB13_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v5
; CI-NEXT: v_mov_b32_e32 v6, v4
@@ -3232,7 +3232,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: .LBB13_6:
; CI-NEXT: v_mov_b32_e32 v7, v5
; CI-NEXT: v_mov_b32_e32 v6, v4
-; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
; CI-NEXT: s_mov_b32 s2, 0
@@ -3256,7 +3256,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr2_vgpr3
; CI-NEXT: s_cbranch_vccz .LBB13_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_mov_b32_e32 v2, s10
; CI-NEXT: v_mov_b32_e32 v3, s11
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
@@ -3274,7 +3274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB13_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
@@ -3297,10 +3297,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8
; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9
-; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: .LBB13_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mov_b32_e32 v8, v6
@@ -3319,7 +3319,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: .LBB13_14:
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mov_b32_e32 v8, v6
-; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11
; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
; CI-NEXT: s_mov_b32 s2, 0
@@ -3371,7 +3371,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
; VI-NEXT: s_cbranch_vccz .LBB13_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
@@ -3389,7 +3389,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB13_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
@@ -3412,10 +3412,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
-; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: .LBB13_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mov_b32_e32 v6, v4
@@ -3434,7 +3434,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: .LBB13_6:
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
; VI-NEXT: s_mov_b32 s2, 0
@@ -3458,7 +3458,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
; VI-NEXT: s_cbranch_vccz .LBB13_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
@@ -3476,7 +3476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB13_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
@@ -3499,10 +3499,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8
; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9
-; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: .LBB13_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v8, v6
@@ -3521,7 +3521,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: .LBB13_14:
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v8, v6
-; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11
; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
; VI-NEXT: s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index 7633ba0..66cc7f3 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -15,7 +15,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: v_max_i32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: abs_i16:
@@ -23,7 +23,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX7-NEXT: v_max_i32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_i16:
@@ -97,9 +97,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_max_i32_e32 v0, v2, v0
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -110,9 +110,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_i32_e32 v0, v2, v0
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX7-NEXT: v_max_i32_e32 v1, v2, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -172,15 +172,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v3i16:
@@ -189,15 +189,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_i32_e32 v2, v3, v2
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v3i16:
@@ -262,47 +262,45 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX6-LABEL: v_abs_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX7-NEXT: v_max_i32_e32 v2, v4, v2
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v4i16:
@@ -370,63 +368,61 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
; GFX6-LABEL: v_abs_v6i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v6, v5
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v6i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v6, v5
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_i32_e32 v2, v6, v2
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v6i16:
@@ -509,83 +505,79 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX6-LABEL: v_abs_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v8, v5
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v8, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v8i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v8, v5
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX7-NEXT: v_max_i32_e32 v6, v8, v6
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v8, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v8i16:
@@ -682,155 +674,147 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX6-LABEL: v_abs_v16i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v16, v13
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX6-NEXT: v_max_i32_e32 v14, v16, v14
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v16, v15
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v16, v11
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v16, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v16
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v16i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v16, v13
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX7-NEXT: v_max_i32_e32 v14, v16, v14
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v16, v15
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v16, v11
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v16, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v16
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v16i16:
@@ -974,303 +958,287 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX6-LABEL: v_abs_v32i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v25, v31, v25
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX6-NEXT: v_max_i32_e32 v28, v31, v28
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX6-NEXT: v_max_i32_e32 v29, v31, v29
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX6-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX6-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX6-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX6-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX6-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX6-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX6-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX6-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX6-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX6-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX6-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX6-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX6-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX6-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX6-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX6-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX6-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v32i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v25, v31, v25
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX7-NEXT: v_max_i32_e32 v28, v31, v28
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX7-NEXT: v_max_i32_e32 v29, v31, v29
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX7-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX7-NEXT: v_max_i32_e32 v20, v20, v29
; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX7-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX7-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX7-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX7-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX7-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX7-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX7-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX7-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX7-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX7-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX7-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX7-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX7-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index d25bfbb..12309f3 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -780,7 +780,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -790,11 +790,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f96a6f7..b239c46 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 0
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index df9c97f..74552a5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6551,271 +6551,205 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
@@ -15709,61 +15643,61 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -15778,121 +15712,123 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15903,215 +15839,179 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4
; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -16133,433 +16033,329 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -29381,870 +29177,1844 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_branch .LBB19_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v32i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB19_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB19_3
-; GFX11-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB19_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB19_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB19_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB19_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB19_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB19_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB19_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -42692,271 +43462,205 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
@@ -53003,61 +53707,61 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -53072,121 +53776,123 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -53197,215 +53903,179 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4
; GFX11-TRUE16-NEXT: .LBB38_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -53427,433 +54097,329 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -66721,870 +67287,1844 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg
; GFX9-NEXT: .LBB43_4:
; GFX9-NEXT: s_branch .LBB43_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v32f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB43_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB43_3
-; GFX11-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB43_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB43_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB43_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB43_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB43_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB43_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB43_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -78968,271 +80508,205 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
@@ -88136,61 +89610,61 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -88205,121 +89679,123 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -88330,215 +89806,179 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4
; GFX11-TRUE16-NEXT: .LBB58_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -88560,433 +90000,329 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -101796,870 +103132,1844 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: .LBB63_4:
; GFX9-NEXT: s_branch .LBB63_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v16i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_3
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB63_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB63_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB63_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -113114,271 +115424,205 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
@@ -123405,61 +125649,61 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -123474,121 +125718,123 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -123599,215 +125845,179 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4
; GFX11-TRUE16-NEXT: .LBB74_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -123829,433 +126039,329 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -137013,870 +139119,1844 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg
; GFX9-NEXT: .LBB79_4:
; GFX9-NEXT: s_branch .LBB79_2
;
-; GFX11-LABEL: bitcast_v64bf16_to_v16f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:288
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:284
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:280
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:276
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:272
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:268
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:264
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:260
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:256
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:252
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:248
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:244
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:240
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:236
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:232
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:228
-; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:224
-; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:220
-; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:216
-; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:212
-; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:208
-; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:204
-; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:200
-; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:196
-; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:192
-; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:188
-; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:184
-; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:180
-; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:176
-; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:172
-; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:168
-; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:160
-; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:156
-; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:152
-; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:148
-; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:144
-; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:140
-; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:136
-; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:132
-; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:128
-; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:124
-; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:120
-; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:116
-; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:112
-; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:108
-; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:104
-; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:100
-; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:96
-; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:92
-; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:88
-; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:84
-; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:80
-; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:16
-; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:12
-; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v184, s32
-; GFX11-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
-; GFX11-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
-; GFX11-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
-; GFX11-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
-; GFX11-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
-; GFX11-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
-; GFX11-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
-; GFX11-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB79_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
-; GFX11-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
-; GFX11-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
-; GFX11-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
-; GFX11-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
-; GFX11-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
-; GFX11-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
-; GFX11-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB79_3
-; GFX11-NEXT: .LBB79_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s5, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s26, 16
-; GFX11-NEXT: s_and_b32 s4, s26, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: s_lshl_b32 s7, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_and_b32 s5, s25, 0xffff0000
-; GFX11-NEXT: s_and_b32 s4, s24, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s4, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v9
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v151, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s4, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v11, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v12
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s4, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_lshl_b32 s4, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_bfe_u32 v12, v16, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s20, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v12, v16
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s4, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_and_b32 s4, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v18, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v16, v19
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT: s_and_b32 s4, s18, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v21, v17
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: s_lshl_b32 s4, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v18
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v20, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_bfe_u32 v19, v22, 16, 1
-; GFX11-NEXT: s_and_b32 s4, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v19, v22
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT: s_lshl_b32 s4, s17, 16
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_and_b32 s4, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v24, v19
-; GFX11-NEXT: v_bfe_u32 v22, v25, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s4, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v21, v22, v25
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v27, v23, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v25
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v22, v24, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v25, v27, v23
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v23
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v24
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v23, v26, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_bfe_u32 v25, v28, 16, 1
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v22
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v25, v28
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
-; GFX11-NEXT: v_bfe_u32 v30, v25, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v30, v25
-; GFX11-NEXT: v_bfe_u32 v28, v31, 16, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v28, v31
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-NEXT: s_and_b32 s1, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v28, v30, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v33, v29
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v29
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v29, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v31, v34, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v29, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v178
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v31, v34
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT: v_lshl_or_b32 v109, v5, 16, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
-; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v33
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v31
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT: v_bfe_u32 v33, v35, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v180
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v33, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v178, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v36, v37
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v38
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_lshl_or_b32 v179, v32, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v136, v2, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v37, v36
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v181
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v180, v31, 16, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v33, v35, v37
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v182, v31, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v38, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v39, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v169
-; GFX11-NEXT: v_lshl_or_b32 v181, v32, 16, v33
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v34
-; GFX11-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v35, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v176
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v36
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
-; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v49, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v170, v33, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v49, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v174
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
-; GFX11-NEXT: v_lshl_or_b32 v169, v31, 16, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v31, v36, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v177
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v31, v36
-; GFX11-NEXT: v_lshl_or_b32 v176, v33, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v32
-; GFX11-NEXT: v_bfe_u32 v50, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
-; GFX11-NEXT: v_bfe_u32 v34, v35, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_bfe_u32 v49, v37, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v50, v38
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v39, v38, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_lshl_or_b32 v174, v33, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v171, v32, 16, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v31, v48, v37
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v175
-; GFX11-NEXT: v_add_nc_u32_e32 v39, v39, v38
-; GFX11-NEXT: v_lshl_or_b32 v177, v35, 16, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
-; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v173
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v37, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v39, v34
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v35
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_lshl_or_b32 v122, v3, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
-; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v36, v38, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v38
-; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
-; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v172
-; GFX11-NEXT: v_add_nc_u32_e32 v36, v36, v38
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v48
-; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: v_bfe_u32 v50, v37, 16, 1
-; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
-; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v38, v38, v39
-; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v37
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
-; GFX11-NEXT: v_bfe_u32 v52, v50, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v51, v51, v48
-; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v52, v52, v50
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
-; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v50
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT: v_lshl_or_b32 v184, v32, 16, v31
-; GFX11-NEXT: v_lshl_or_b32 v175, v33, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT: v_lshl_or_b32 v173, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v97, v8, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v86, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v76, v11, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v67, v14, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v172, v37, 16, v38
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v59, v16, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v52, v18, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v46, v21, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v41, v22, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v183, v39, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v37, v24, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v34, v26, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v29, 16, v30
-; GFX11-NEXT: .LBB79_3: ; %end
-; GFX11-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
-; GFX11-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
-; GFX11-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
-; GFX11-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
-; GFX11-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
-; GFX11-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
-; GFX11-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
-; GFX11-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
-; GFX11-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
-; GFX11-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v184, off, s32
-; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:120
-; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:136
-; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:152
-; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:164
-; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:168
-; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:172
-; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:176
-; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:180
-; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:184
-; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:188
-; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:192
-; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:196
-; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:200
-; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:204
-; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:208
-; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:212
-; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:216
-; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:220
-; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:224
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:228
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:232
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:236
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:240
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:244
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:248
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x8
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:256
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:260
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:264
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:268
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:272
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:276
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:280
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:284
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:288
-; GFX11-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
-; GFX11-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
-; GFX11-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
-; GFX11-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
-; GFX11-NEXT: v_mov_b32_e32 v28, v182
-; GFX11-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB79_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-; GFX11-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
-; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
-; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
-; GFX11-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
-; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
-; GFX11-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
-; GFX11-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
-; GFX11-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
-; GFX11-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
-; GFX11-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
-; GFX11-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
-; GFX11-NEXT: s_branch .LBB79_2
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v167, v13 :: v_dual_mov_b32 v176, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v177, v11 :: v_dual_mov_b32 v178, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v179, v9 :: v_dual_mov_b32 v180, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v135, s0 :: v_dual_mov_b32 v134, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v132, s2 :: v_dual_mov_b32 v129, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v125, s16 :: v_dual_mov_b32 v120, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s18 :: v_dual_mov_b32 v107, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s20 :: v_dual_mov_b32 v90, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s22 :: v_dual_mov_b32 v69, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v57, s24 :: v_dual_mov_b32 v44, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3
+; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l
+; GFX11-TRUE16-NEXT: .LBB79_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0x6
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:280
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v135 :: v_dual_mov_b32 v1, v134
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v132 :: v_dual_mov_b32 v3, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v80 :: v_dual_mov_b32 v11, v69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v182 :: v_dual_mov_b32 v27, v179
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v180 :: v_dual_mov_b32 v29, v177
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v178 :: v_dual_mov_b32 v31, v167
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v176
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB79_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166
+; GFX11-TRUE16-NEXT: s_branch .LBB79_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3
+; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37
+; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49
+; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30
+; GFX11-FAKE16-NEXT: .LBB79_3: ; %end
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12
+; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16
+; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20
+; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24
+; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28
+; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36
+; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40
+; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48
+; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52
+; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56
+; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60
+; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64
+; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68
+; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72
+; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76
+; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80
+; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84
+; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88
+; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92
+; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96
+; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100
+; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104
+; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108
+; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112
+; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
+; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
+; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
+; GFX11-FAKE16-NEXT: s_clause 0x1f
+; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
+; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
+; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
+; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140
+; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144
+; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148
+; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152
+; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156
+; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160
+; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164
+; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168
+; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172
+; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176
+; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180
+; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188
+; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192
+; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196
+; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200
+; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204
+; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208
+; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212
+; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216
+; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220
+; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228
+; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232
+; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236
+; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240
+; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
+; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
+; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
+; GFX11-FAKE16-NEXT: s_clause 0x8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
+; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
+; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
+; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268
+; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB79_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168
+; GFX11-FAKE16-NEXT: s_branch .LBB79_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -154990,9 +158070,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -155008,6 +158089,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -155018,201 +158100,169 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3
; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true
@@ -155252,57 +158302,59 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -155311,7 +158363,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -155322,18 +158374,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -155341,13 +158393,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -155356,29 +158408,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -155392,8 +158444,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -155401,167 +158453,141 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -161654,179 +164680,182 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112
-; GFX11-TRUE16-NEXT: s_clause 0x18
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v98, off, s32
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr155_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
@@ -161835,136 +164864,136 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v99
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v99
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v81
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v81
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v98
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v80
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[98:99]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v142.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v143.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v141.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v136.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v139.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v155.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v24.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v98.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v80.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v81.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5
@@ -161980,7 +165009,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4
@@ -162019,10 +165048,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v37, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v37, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
@@ -162036,97 +165065,101 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v135.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v33, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[31:32]
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v147.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v17, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v17, v33 :: v_dual_and_b32 v18, 0xffff0000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v18, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v33, v20, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v151.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v34
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v149, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v19, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v149.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v17, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v160.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v161.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v33
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v151.h
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v23 :: v_dual_lshlrev_b32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162139,8 +165172,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v161, v19, v23 :: v_dual_lshlrev_b32 v22, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v163.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
@@ -162153,10 +165188,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v161.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v150.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162169,10 +165203,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v35
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v162.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v165.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
@@ -162185,10 +165219,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v163.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v38
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v38
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162201,14 +165235,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v99
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v51
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v81
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v167.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v81
; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v53, v17, v24
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
@@ -162217,14 +165251,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v98
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v160.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v165.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v164.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v80
; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_cndmask_b32 v52, v19, v24
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
@@ -162233,10 +165267,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v53
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v53
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v37
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v177.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v49
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
@@ -162249,10 +165283,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v167.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v17, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1
@@ -162263,13 +165296,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v55
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v179.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -162282,11 +165314,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v162.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v177.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v17, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -162301,9 +165332,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v65
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v65
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v2, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v166.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v181.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v2, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3
@@ -162313,13 +165344,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v179.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v51
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v164.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v1, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v176.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v1, v18, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
@@ -162330,13 +165361,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[50:51]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v183.h
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[48:49]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[37:38]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v2, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v53
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v53
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v50
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v2, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
@@ -162350,28 +165381,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v56, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v2, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v48
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v56.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v41.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v83, v1, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v71, v1, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -162380,29 +165410,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v60.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v166.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v46.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v1, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v79.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v178.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v74, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -162410,40 +165440,40 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v76.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v176.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[82:83]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v2, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v182.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v74.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v72.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v2, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[70:71]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[54:55]
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v106, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v106.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v104, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v104.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v106, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[96:97]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v180.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v104.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v106.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[84:85]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
@@ -162452,8 +165482,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v178.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v127, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v136, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
@@ -162461,19 +165491,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v125, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v139, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v40.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v139.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
@@ -162481,11 +165511,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v125.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v127.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v102.l, v136.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v40.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v142, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
@@ -162494,389 +165524,322 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v142.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v141, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v42.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[112:113]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v143, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[130:131]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v155, v7, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[68:69]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[68:69]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v141.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[33:34]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v129
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v129
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v143.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v134
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v134
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v128
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v113
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[133:134]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[128:129]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[35:36]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v133
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v113
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v112
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v97
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v97
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v96
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v83
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v83
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v82
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v69
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v68
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v66
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v154.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v155.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v148
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v148
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v130
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[147:148]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v147
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v103
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v103
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v102
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v85
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v85
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v84
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v71
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v71
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v70
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v67
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v67
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v65
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v54
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v55
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v55
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v52
; GFX11-TRUE16-NEXT: .LBB90_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v181.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v139.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v180.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v143.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v141.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v183.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v140.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v177.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v140.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v40.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v138.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v136.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v182.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v127.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v125.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v41.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v123.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v179.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v137.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v56.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v126.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v107.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v42.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v123.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v79.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v111.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v91.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v60.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v109.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v75.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v5, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v106.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v95.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v5, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v61.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v111.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v72.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v109.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v46.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v107.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v105.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v92.l
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v89.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v136.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v79.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v106.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v77.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v75.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v74.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v91.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v153.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v139.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v155.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v154.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v148.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v142.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v138.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v137.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v151.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v126.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v124.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v76.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v93.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v5, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v127.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v89.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v45.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v104.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v78.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v120.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v5, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v142.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v5, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v105.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v125.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v63.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v5, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v122.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v120.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v110.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v108.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v95.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v94.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v93.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v90.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v143.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v58.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v90.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v5, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v141.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v47.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v74.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v5, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v124.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v5, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v59.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v122.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v5, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v110.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v44.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v5, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v88.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v78.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v167.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v76.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v166.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v177.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v176.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v108.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v183.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v5, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v150.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v94.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v34.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v5, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v180.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v92.l
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v5, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v88.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v5, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v77.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v5, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v26.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v72.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v5, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v26.l, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v62.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v5, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v57.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v5, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v29.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v46.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v5, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.l, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v166.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v43.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v5, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v41.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v5, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v176.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v182.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v167.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v181.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v5, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v34.l, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v5, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v56.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v179.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v43.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v42.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:136
-; GFX11-TRUE16-NEXT: s_clause 0x18
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136
+; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:248
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -169227,1575 +172190,3138 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v128i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_store_b32 off, v40, s32
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:8
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12
-; GFX11-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v41, s96, 0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT: v_readfirstlane_b32 s72, v1
-; GFX11-NEXT: v_readfirstlane_b32 s73, v2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: v_writelane_b32 v41, s97, 1
-; GFX11-NEXT: v_readfirstlane_b32 s62, v3
-; GFX11-NEXT: v_readfirstlane_b32 s63, v4
-; GFX11-NEXT: v_readfirstlane_b32 s60, v5
-; GFX11-NEXT: v_writelane_b32 v40, s34, 2
-; GFX11-NEXT: v_writelane_b32 v41, s98, 2
-; GFX11-NEXT: v_readfirstlane_b32 s61, v6
-; GFX11-NEXT: v_readfirstlane_b32 s58, v7
-; GFX11-NEXT: v_readfirstlane_b32 s59, v8
-; GFX11-NEXT: v_writelane_b32 v40, s35, 3
-; GFX11-NEXT: v_writelane_b32 v41, s99, 3
-; GFX11-NEXT: v_readfirstlane_b32 s56, v9
-; GFX11-NEXT: v_readfirstlane_b32 s57, v10
-; GFX11-NEXT: v_readfirstlane_b32 s46, v11
-; GFX11-NEXT: v_writelane_b32 v40, s36, 4
-; GFX11-NEXT: v_writelane_b32 v41, s100, 4
-; GFX11-NEXT: v_readfirstlane_b32 s47, v12
-; GFX11-NEXT: v_readfirstlane_b32 s44, v13
-; GFX11-NEXT: v_readfirstlane_b32 s45, v14
-; GFX11-NEXT: v_writelane_b32 v40, s37, 5
-; GFX11-NEXT: v_writelane_b32 v41, s101, 5
-; GFX11-NEXT: s_mov_b32 vcc_hi, 0
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; GFX11-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; GFX11-NEXT: v_writelane_b32 v40, s38, 6
-; GFX11-NEXT: v_writelane_b32 v41, s102, 6
-; GFX11-NEXT: v_writelane_b32 v40, s39, 7
-; GFX11-NEXT: v_writelane_b32 v41, s103, 7
-; GFX11-NEXT: v_writelane_b32 v40, s48, 8
-; GFX11-NEXT: v_writelane_b32 v41, s104, 8
-; GFX11-NEXT: v_writelane_b32 v40, s49, 9
-; GFX11-NEXT: v_writelane_b32 v40, s50, 10
-; GFX11-NEXT: v_writelane_b32 v40, s51, 11
-; GFX11-NEXT: v_writelane_b32 v40, s52, 12
-; GFX11-NEXT: v_writelane_b32 v40, s53, 13
-; GFX11-NEXT: v_writelane_b32 v40, s54, 14
-; GFX11-NEXT: v_writelane_b32 v40, s55, 15
-; GFX11-NEXT: v_writelane_b32 v40, s64, 16
-; GFX11-NEXT: v_writelane_b32 v40, s65, 17
-; GFX11-NEXT: v_writelane_b32 v40, s66, 18
-; GFX11-NEXT: v_writelane_b32 v40, s67, 19
-; GFX11-NEXT: v_writelane_b32 v40, s68, 20
-; GFX11-NEXT: v_writelane_b32 v40, s69, 21
-; GFX11-NEXT: v_writelane_b32 v40, s70, 22
-; GFX11-NEXT: v_writelane_b32 v40, s71, 23
-; GFX11-NEXT: v_writelane_b32 v40, s80, 24
-; GFX11-NEXT: v_writelane_b32 v40, s81, 25
-; GFX11-NEXT: v_writelane_b32 v40, s82, 26
-; GFX11-NEXT: v_writelane_b32 v40, s83, 27
-; GFX11-NEXT: v_writelane_b32 v40, s84, 28
-; GFX11-NEXT: v_writelane_b32 v40, s85, 29
-; GFX11-NEXT: v_writelane_b32 v40, s86, 30
-; GFX11-NEXT: v_writelane_b32 v40, s87, 31
-; GFX11-NEXT: s_cbranch_scc0 .LBB91_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s4, s27, 24
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 15
-; GFX11-NEXT: s_lshr_b32 s4, s27, 16
-; GFX11-NEXT: s_lshr_b32 s99, s2, 16
-; GFX11-NEXT: s_lshr_b32 s100, s2, 8
-; GFX11-NEXT: s_lshr_b32 s101, s1, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 14
-; GFX11-NEXT: s_lshr_b32 s4, s27, 8
-; GFX11-NEXT: s_lshr_b32 s11, s1, 16
-; GFX11-NEXT: s_lshr_b32 s102, s1, 8
-; GFX11-NEXT: s_lshr_b32 s103, s0, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 16
-; GFX11-NEXT: s_lshr_b32 s4, s26, 16
-; GFX11-NEXT: s_lshr_b32 s104, s0, 8
-; GFX11-NEXT: s_lshr_b32 s85, s45, 24
-; GFX11-NEXT: s_lshr_b32 s10, s45, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 17
-; GFX11-NEXT: s_lshr_b32 s4, s26, 8
-; GFX11-NEXT: s_lshr_b32 s5, s45, 8
-; GFX11-NEXT: s_lshr_b32 s87, s44, 16
-; GFX11-NEXT: s_lshr_b32 s86, s44, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 18
-; GFX11-NEXT: s_lshr_b32 s4, s25, 24
-; GFX11-NEXT: s_lshr_b32 s81, s47, 24
-; GFX11-NEXT: s_lshr_b32 s98, s47, 16
-; GFX11-NEXT: s_lshr_b32 s84, s47, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 19
-; GFX11-NEXT: s_lshr_b32 s4, s25, 16
-; GFX11-NEXT: s_lshr_b32 s48, s46, 8
-; GFX11-NEXT: s_lshr_b32 s70, s57, 24
-; GFX11-NEXT: s_lshr_b32 s97, s57, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 13
-; GFX11-NEXT: s_lshr_b32 s4, s25, 8
-; GFX11-NEXT: s_lshr_b32 s80, s57, 8
-; GFX11-NEXT: s_lshr_b32 s83, s56, 16
-; GFX11-NEXT: s_lshr_b32 s82, s56, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 20
-; GFX11-NEXT: s_lshr_b32 s4, s24, 16
-; GFX11-NEXT: s_lshr_b32 s66, s59, 24
-; GFX11-NEXT: s_lshr_b32 s9, s59, 16
-; GFX11-NEXT: s_lshr_b32 s69, s59, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 21
-; GFX11-NEXT: s_lshr_b32 s4, s24, 8
-; GFX11-NEXT: s_lshr_b32 s71, s58, 16
-; GFX11-NEXT: s_lshr_b32 s39, s58, 8
-; GFX11-NEXT: s_lshr_b32 s55, s61, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 22
-; GFX11-NEXT: s_lshr_b32 s4, s23, 24
-; GFX11-NEXT: s_lshr_b32 s8, s61, 16
-; GFX11-NEXT: s_lshr_b32 s65, s61, 8
-; GFX11-NEXT: s_lshr_b32 s68, s60, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 23
-; GFX11-NEXT: s_lshr_b32 s4, s23, 16
-; GFX11-NEXT: s_lshr_b32 s67, s60, 8
-; GFX11-NEXT: s_lshr_b32 s51, s63, 24
-; GFX11-NEXT: s_lshr_b32 s96, s63, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 12
-; GFX11-NEXT: s_lshr_b32 s4, s23, 8
-; GFX11-NEXT: s_lshr_b32 s54, s63, 8
-; GFX11-NEXT: s_lshr_b32 s38, s62, 16
-; GFX11-NEXT: s_lshr_b32 s64, s62, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 24
-; GFX11-NEXT: s_lshr_b32 s4, s22, 16
-; GFX11-NEXT: s_lshr_b32 s36, s73, 24
-; GFX11-NEXT: s_lshr_b32 s7, s73, 16
-; GFX11-NEXT: s_lshr_b32 s50, s73, 8
-; GFX11-NEXT: v_writelane_b32 v43, s4, 25
-; GFX11-NEXT: s_lshr_b32 s4, s22, 8
-; GFX11-NEXT: s_lshr_b32 s53, s72, 16
-; GFX11-NEXT: s_lshr_b32 s52, s72, 8
-; GFX11-NEXT: s_lshr_b32 s34, s29, 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 26
-; GFX11-NEXT: s_lshr_b32 s4, s21, 24
-; GFX11-NEXT: s_lshr_b32 s6, s29, 16
-; GFX11-NEXT: s_lshr_b32 s35, s29, 8
-; GFX11-NEXT: s_lshr_b32 s37, s28, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 27
-; GFX11-NEXT: s_lshr_b32 s4, s21, 16
-; GFX11-NEXT: s_lshr_b32 s49, s28, 8
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
-; GFX11-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 11
-; GFX11-NEXT: s_lshr_b32 s4, s21, 8
-; GFX11-NEXT: s_lshr_b64 s[42:43], s[0:1], 24
-; GFX11-NEXT: s_lshr_b64 s[74:75], s[44:45], 24
-; GFX11-NEXT: s_lshr_b64 s[76:77], s[46:47], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 28
-; GFX11-NEXT: s_lshr_b32 s4, s20, 16
-; GFX11-NEXT: s_lshr_b64 s[78:79], s[56:57], 24
-; GFX11-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
-; GFX11-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 29
-; GFX11-NEXT: s_lshr_b32 s4, s20, 8
-; GFX11-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
-; GFX11-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
-; GFX11-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
-; GFX11-NEXT: v_writelane_b32 v43, s4, 30
-; GFX11-NEXT: s_lshr_b32 s4, s19, 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s4, 31
-; GFX11-NEXT: s_lshr_b32 s4, s19, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 10
-; GFX11-NEXT: s_lshr_b32 s4, s19, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 0
-; GFX11-NEXT: s_lshr_b32 s4, s18, 16
-; GFX11-NEXT: v_writelane_b32 v42, s4, 1
-; GFX11-NEXT: s_lshr_b32 s4, s18, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 2
-; GFX11-NEXT: s_lshr_b32 s4, s17, 24
-; GFX11-NEXT: v_writelane_b32 v42, s4, 3
-; GFX11-NEXT: s_lshr_b32 s4, s17, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s4, 9
-; GFX11-NEXT: s_lshr_b32 s4, s17, 8
-; GFX11-NEXT: v_writelane_b32 v42, s4, 4
-; GFX11-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 5
-; GFX11-NEXT: s_lshr_b32 s4, s16, 8
-; GFX11-NEXT: v_writelane_b32 v42, s4, 6
-; GFX11-NEXT: s_lshr_b32 s4, s3, 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 7
-; GFX11-NEXT: s_lshr_b32 s4, s3, 16
-; GFX11-NEXT: v_writelane_b32 v43, s4, 8
-; GFX11-NEXT: s_lshr_b32 s4, s3, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v42, s4, 8
-; GFX11-NEXT: s_lshr_b32 s4, s46, 16
-; GFX11-NEXT: v_writelane_b32 v43, s12, 6
-; GFX11-NEXT: v_writelane_b32 v43, s13, 7
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
-; GFX11-NEXT: v_writelane_b32 v43, s12, 4
-; GFX11-NEXT: v_writelane_b32 v43, s13, 5
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_writelane_b32 v43, s12, 2
-; GFX11-NEXT: v_writelane_b32 v43, s13, 3
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
-; GFX11-NEXT: v_writelane_b32 v43, s12, 0
-; GFX11-NEXT: v_writelane_b32 v43, s13, 1
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
-; GFX11-NEXT: s_cbranch_vccnz .LBB91_4
-; GFX11-NEXT: .LBB91_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s4, s29, 0xffff0000
-; GFX11-NEXT: s_and_b32 s14, s47, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-NEXT: s_and_b32 s4, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s15, s47, 16
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s6, s29, 16
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
-; GFX11-NEXT: s_and_b32 s8, s45, 0xffff0000
-; GFX11-NEXT: v_readfirstlane_b32 s47, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_lshl_b32 s7, s45, 16
-; GFX11-NEXT: s_and_b32 s78, s28, 0xffff0000
-; GFX11-NEXT: s_bfe_u32 s6, s47, 0x10010
-; GFX11-NEXT: s_lshl_b32 s79, s28, 16
-; GFX11-NEXT: s_add_i32 s45, s6, s47
-; GFX11-NEXT: s_and_b32 s5, s73, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s77, s73, 16
-; GFX11-NEXT: s_and_b32 s75, s72, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s76, s72, 16
-; GFX11-NEXT: s_and_b32 s11, s63, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s74, s63, 16
-; GFX11-NEXT: s_and_b32 s72, s62, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s73, s62, 16
-; GFX11-NEXT: s_and_b32 s63, s61, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s62, s61, 16
-; GFX11-NEXT: s_and_b32 s61, s60, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s60, s60, 16
-; GFX11-NEXT: s_and_b32 s41, s59, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s40, s59, 16
-; GFX11-NEXT: s_and_b32 s28, s58, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s29, s58, 16
-; GFX11-NEXT: s_and_b32 s13, s57, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s10, s57, 16
-; GFX11-NEXT: s_and_b32 s42, s56, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s43, s56, 16
-; GFX11-NEXT: s_and_b32 s12, s46, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s9, s46, 16
-; GFX11-NEXT: s_and_b32 s4, s44, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s47, 22
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_cselect_b32 s44, s47, s45
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v2
-; GFX11-NEXT: s_lshr_b32 s58, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s78
-; GFX11-NEXT: v_readfirstlane_b32 s1, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s79
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_bfe_u32 s45, s1, 0x10010
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: s_add_i32 s45, s45, s1
-; GFX11-NEXT: s_bitset1_b32 s1, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, s1, s45
-; GFX11-NEXT: s_and_b32 s44, s0, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s44
-; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v6
-; GFX11-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_readfirstlane_b32 s44, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v7
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v21
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
-; GFX11-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s77
-; GFX11-NEXT: s_bfe_u32 s5, s0, 0x10010
-; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v4
-; GFX11-NEXT: s_add_i32 s45, s5, s0
-; GFX11-NEXT: s_lshr_b32 s5, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s0, 22
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s0, s0, s45
-; GFX11-NEXT: s_and_b32 s44, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s44
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v23
-; GFX11-NEXT: v_readfirstlane_b32 s44, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s76
-; GFX11-NEXT: s_lshr_b32 s59, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s75
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_readfirstlane_b32 s3, v10
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 24, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: s_bfe_u32 s45, s3, 0x10010
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_add_i32 s45, s45, s3
-; GFX11-NEXT: s_bitset1_b32 s3, 22
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s3, s3, s45
-; GFX11-NEXT: s_and_b32 s44, s2, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_readfirstlane_b32 s44, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v24
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s74
-; GFX11-NEXT: v_lshl_or_b32 v14, v25, 16, v5
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_add_i32 s45, s11, s2
-; GFX11-NEXT: s_lshr_b32 s11, s44, 16
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s2, 22
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s2, s2, s45
-; GFX11-NEXT: s_and_b32 s44, s17, 0xffff0000
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v26
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s44
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_readfirstlane_b32 s44, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s17, s17, 16
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s73
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s72
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_readfirstlane_b32 s17, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshr_b32 s72, s44, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: s_bfe_u32 s45, s17, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v27
-; GFX11-NEXT: s_add_i32 s45, s45, s17
-; GFX11-NEXT: s_bitset1_b32 s17, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: v_lshl_or_b32 v16, v28, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: s_cselect_b32 s17, s17, s45
-; GFX11-NEXT: s_and_b32 s44, s16, 0xffff0000
-; GFX11-NEXT: s_lshr_b32 s17, s17, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v29
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s44
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s44, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s16
-; GFX11-NEXT: s_lshr_b32 s46, s44, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s16, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s62
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_bfe_u32 s45, s16, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s45, s45, s16
-; GFX11-NEXT: s_bitset1_b32 s16, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s16, s16, s45
-; GFX11-NEXT: s_and_b32 s44, s19, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v5
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: s_lshr_b32 s16, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s44, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v4
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4
-; GFX11-NEXT: s_bfe_u32 s45, s44, 0x10010
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s61
-; GFX11-NEXT: s_add_i32 s45, s45, s44
-; GFX11-NEXT: s_bitset1_b32 s44, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s44, s44, s45
-; GFX11-NEXT: s_lshl_b32 s19, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: s_lshr_b32 s60, s44, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_readfirstlane_b32 s19, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v9, v8
-; GFX11-NEXT: s_bfe_u32 s45, s19, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: s_add_i32 s45, s45, s19
-; GFX11-NEXT: s_bitset1_b32 s19, 22
-; GFX11-NEXT: s_addk_i32 s45, 0x7fff
-; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s19, s19, s45
-; GFX11-NEXT: s_and_b32 s44, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s44
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_lshr_b32 s19, s19, 16
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s29
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s41
-; GFX11-NEXT: v_readfirstlane_b32 s41, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_pack_ll_b32_b16 s47, s17, s72
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: s_bfe_u32 s44, s41, 0x10010
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_add_i32 s44, s44, s41
-; GFX11-NEXT: s_bitset1_b32 s41, 22
-; GFX11-NEXT: s_addk_i32 s44, 0x7fff
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
-; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s41, s41, s44
-; GFX11-NEXT: s_lshl_b32 s18, s18, 16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v31
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s18
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_lshl_or_b32 v18, v30, 16, v4
-; GFX11-NEXT: v_readfirstlane_b32 s18, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshl_or_b32 v17, v1, 16, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: s_bfe_u32 s40, s18, 0x10010
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s28
-; GFX11-NEXT: s_add_i32 s44, s40, s18
-; GFX11-NEXT: s_lshr_b32 s40, s41, 16
-; GFX11-NEXT: s_addk_i32 s44, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s18, 22
-; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s18, s18, s44
-; GFX11-NEXT: s_and_b32 s41, s21, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
-; GFX11-NEXT: v_bfe_u32 v2, v9, 16, 1
-; GFX11-NEXT: s_lshr_b32 s18, s18, 16
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s28, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v9
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v10, 16, 1
-; GFX11-NEXT: s_bfe_u32 s29, s28, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; GFX11-NEXT: s_add_i32 s29, s29, s28
-; GFX11-NEXT: s_bitset1_b32 s28, 22
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_and_b32 s41, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s28, s28, s29
-; GFX11-NEXT: s_lshl_b32 s21, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s21
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshr_b32 s61, s28, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s44, s2, s11
-; GFX11-NEXT: v_readfirstlane_b32 s21, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_bfe_u32 s29, s21, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v2
-; GFX11-NEXT: s_add_i32 s29, s29, s21
-; GFX11-NEXT: s_bitset1_b32 s21, 22
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_and_b32 s28, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s21, s21, s29
-; GFX11-NEXT: s_and_b32 s28, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v10
-; GFX11-NEXT: s_lshr_b32 s21, s21, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s45, s3, s59
-; GFX11-NEXT: s_pack_ll_b32_b16 s46, s16, s46
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s13
-; GFX11-NEXT: v_readfirstlane_b32 s13, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_bfe_u32 s28, s13, 0x10010
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v34
-; GFX11-NEXT: s_add_i32 s28, s28, s13
-; GFX11-NEXT: s_bitset1_b32 s13, 22
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_and_b32 s29, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s13, s28
-; GFX11-NEXT: s_lshl_b32 s20, s20, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s20
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v35
-; GFX11-NEXT: v_readfirstlane_b32 s20, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v19, v2, 16, v9
-; GFX11-NEXT: s_bfe_u32 s10, s20, 0x10010
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_add_i32 s28, s10, s20
-; GFX11-NEXT: s_lshr_b32 s10, s13, 16
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s20, 22
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_cselect_b32 s13, s20, s28
-; GFX11-NEXT: s_and_b32 s20, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s20
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s43
-; GFX11-NEXT: v_readfirstlane_b32 s28, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_bfe_u32 s20, s28, 0x10010
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: s_add_i32 s29, s20, s28
-; GFX11-NEXT: s_lshr_b32 s20, s13, 16
-; GFX11-NEXT: s_addk_i32 s29, 0x7fff
-; GFX11-NEXT: s_bitset1_b32 s28, 22
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s28, s29
-; GFX11-NEXT: s_lshl_b32 s23, s23, 16
-; GFX11-NEXT: v_bfe_u32 v5, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s23
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v8
-; GFX11-NEXT: s_lshr_b32 s62, s13, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v9
-; GFX11-NEXT: v_readfirstlane_b32 s23, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_bfe_u32 s28, s23, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: s_add_i32 s28, s28, s23
-; GFX11-NEXT: s_bitset1_b32 s23, 22
-; GFX11-NEXT: s_addk_i32 s28, 0x7fff
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_cselect_b32 s13, s23, s28
-; GFX11-NEXT: s_and_b32 s23, s22, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v36
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s23
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s14
-; GFX11-NEXT: s_lshr_b32 s23, s13, 16
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshl_or_b32 v71, v37, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s12
-; GFX11-NEXT: s_bfe_u32 s15, s14, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: s_add_i32 s15, s15, s14
-; GFX11-NEXT: s_bitset1_b32 s14, 22
-; GFX11-NEXT: s_addk_i32 s15, 0x7fff
-; GFX11-NEXT: s_and_b32 s13, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s13, s14, s15
-; GFX11-NEXT: s_lshl_b32 s14, s22, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v38
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v8
-; GFX11-NEXT: s_lshr_b32 s13, s13, 16
-; GFX11-NEXT: v_readfirstlane_b32 s14, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_lshl_or_b32 v70, v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
-; GFX11-NEXT: s_bfe_u32 s12, s14, 0x10010
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v8
-; GFX11-NEXT: s_add_i32 s12, s12, s14
-; GFX11-NEXT: s_bitset1_b32 s14, 22
-; GFX11-NEXT: s_addk_i32 s12, 0x7fff
-; GFX11-NEXT: s_and_b32 s15, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s12, s14, s12
-; GFX11-NEXT: s_and_b32 s14, s25, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s9
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshr_b32 s22, s12, 16
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: s_bfe_u32 s14, s9, 0x10010
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_add_i32 s14, s14, s9
-; GFX11-NEXT: s_bitset1_b32 s9, 22
-; GFX11-NEXT: s_addk_i32 s14, 0x7fff
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: s_and_b32 s12, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s9, s9, s14
-; GFX11-NEXT: s_lshl_b32 s12, s25, 16
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s12
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v3, v4
-; GFX11-NEXT: s_lshr_b32 s63, s9, 16
-; GFX11-NEXT: v_bfe_u32 v3, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_readfirstlane_b32 s8, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v8
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: s_bfe_u32 s12, s8, 0x10010
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: s_add_i32 s12, s12, s8
-; GFX11-NEXT: s_bitset1_b32 s8, 22
-; GFX11-NEXT: s_addk_i32 s12, 0x7fff
-; GFX11-NEXT: s_and_b32 s9, vcc_lo, exec_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_cselect_b32 s8, s8, s12
-; GFX11-NEXT: s_and_b32 s9, s24, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_lshr_b32 s25, s8, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v9
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s0, s5
-; GFX11-NEXT: s_bfe_u32 s9, s7, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v3
-; GFX11-NEXT: s_add_i32 s9, s9, s7
-; GFX11-NEXT: s_bitset1_b32 s7, 22
-; GFX11-NEXT: s_addk_i32 s9, 0x7fff
-; GFX11-NEXT: s_and_b32 s8, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s7, s7, s9
-; GFX11-NEXT: s_lshl_b32 s8, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: s_lshr_b32 s12, s7, 16
-; GFX11-NEXT: v_readfirstlane_b32 s8, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1
-; GFX11-NEXT: s_bfe_u32 s4, s8, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2
-; GFX11-NEXT: s_add_i32 s4, s4, s8
-; GFX11-NEXT: s_bitset1_b32 s8, 22
-; GFX11-NEXT: s_addk_i32 s4, 0x7fff
-; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s8, s4
-; GFX11-NEXT: s_and_b32 s6, s27, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v12
-; GFX11-NEXT: s_lshr_b32 s24, s4, 16
-; GFX11-NEXT: v_readfirstlane_b32 s6, v52
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT: v_bfe_u32 v4, v9, 16, 1
-; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_add_i32 s7, s7, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-NEXT: s_lshl_b32 s6, s27, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v4, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v12
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_lshr_b32 s73, s4, 16
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v49
-; GFX11-NEXT: v_readfirstlane_b32 s6, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v51
-; GFX11-NEXT: v_lshl_or_b32 v66, v1, 16, v11
-; GFX11-NEXT: s_bfe_u32 s7, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_add_i32 s7, s7, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s7, 0x7fff
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s6, s7
-; GFX11-NEXT: s_and_b32 s6, s26, 0xffff0000
-; GFX11-NEXT: s_lshr_b32 s27, s4, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v52
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v39
-; GFX11-NEXT: v_lshl_or_b32 v55, v50, 16, v4
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s22, s13
-; GFX11-NEXT: v_readfirstlane_b32 s6, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_lshl_or_b32 v54, v2, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v67, v48, 16, v5
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18]
-; GFX11-NEXT: s_bfe_u32 s5, s6, 0x10010
-; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
-; GFX11-NEXT: s_add_i32 s5, s5, s6
-; GFX11-NEXT: s_bitset1_b32 s6, 22
-; GFX11-NEXT: s_addk_i32 s5, 0x7fff
-; GFX11-NEXT: s_and_b32 s4, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s14, s6, s5
-; GFX11-NEXT: s_lshl_b32 s4, s26, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s20, s10
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
-; GFX11-NEXT: s_lshr_b32 s13, s14, 16
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s1, s58
-; GFX11-NEXT: v_readfirstlane_b32 s11, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67]
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71]
-; GFX11-NEXT: s_bfe_u32 s10, s11, 0x10010
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
-; GFX11-NEXT: s_add_i32 s10, s10, s11
-; GFX11-NEXT: s_bitset1_b32 s11, 22
-; GFX11-NEXT: s_addk_i32 s10, 0x7fff
-; GFX11-NEXT: s_and_b32 s14, vcc_lo, exec_lo
-; GFX11-NEXT: s_cselect_b32 s10, s11, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s19, s60
-; GFX11-NEXT: s_lshr_b32 s26, s10, 16
-; GFX11-NEXT: s_pack_ll_b32_b16 s4, s18, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s23, s62
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v55
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v55
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v71
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 8, v71
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v70
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 24, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 8, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s21, s61
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s25, s63
-; GFX11-NEXT: s_pack_ll_b32_b16 s57, s27, s73
-; GFX11-NEXT: s_pack_ll_b32_b16 s56, s26, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s24, s12
-; GFX11-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
-; GFX11-NEXT: s_lshr_b64 s[40:41], s[44:45], 24
-; GFX11-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
-; GFX11-NEXT: s_lshr_b64 vcc, s[56:57], 24
-; GFX11-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
-; GFX11-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
-; GFX11-NEXT: s_lshr_b32 s13, s57, 24
-; GFX11-NEXT: s_lshr_b32 s15, s57, 8
-; GFX11-NEXT: s_lshr_b32 s41, s56, 16
-; GFX11-NEXT: s_lshr_b32 s43, s56, 8
-; GFX11-NEXT: s_lshr_b32 s56, s11, 24
-; GFX11-NEXT: s_lshr_b32 s11, s11, 8
-; GFX11-NEXT: s_lshr_b32 s57, s10, 16
-; GFX11-NEXT: s_lshr_b32 s10, s10, 8
-; GFX11-NEXT: s_lshr_b32 s74, s9, 24
-; GFX11-NEXT: s_lshr_b32 s9, s9, 8
-; GFX11-NEXT: s_lshr_b32 s75, s8, 16
-; GFX11-NEXT: s_lshr_b32 s8, s8, 8
-; GFX11-NEXT: s_lshr_b32 s76, s7, 24
-; GFX11-NEXT: s_lshr_b32 s77, s7, 8
-; GFX11-NEXT: s_lshr_b32 s78, s6, 16
-; GFX11-NEXT: s_lshr_b32 s79, s6, 8
-; GFX11-NEXT: s_lshr_b32 s88, s5, 24
-; GFX11-NEXT: s_lshr_b32 s89, s5, 8
-; GFX11-NEXT: s_lshr_b32 s90, s4, 16
-; GFX11-NEXT: s_lshr_b32 s91, s4, 8
-; GFX11-NEXT: s_lshr_b32 s92, s47, 24
-; GFX11-NEXT: s_lshr_b32 s47, s47, 8
-; GFX11-NEXT: s_lshr_b32 s93, s46, 16
-; GFX11-NEXT: s_lshr_b32 s46, s46, 8
-; GFX11-NEXT: s_lshr_b32 s95, s45, 24
-; GFX11-NEXT: s_lshr_b32 s45, s45, 8
-; GFX11-NEXT: s_lshr_b32 s99, s44, 16
-; GFX11-NEXT: s_lshr_b32 s100, s44, 8
-; GFX11-NEXT: s_lshr_b32 s101, s29, 24
-; GFX11-NEXT: s_lshr_b32 s102, s29, 8
-; GFX11-NEXT: s_lshr_b32 s103, s28, 16
-; GFX11-NEXT: s_lshr_b32 s104, s28, 8
-; GFX11-NEXT: s_branch .LBB91_5
-; GFX11-NEXT: .LBB91_3:
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr104
-; GFX11-NEXT: ; implicit-def: $sgpr103
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr102
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr101
-; GFX11-NEXT: ; implicit-def: $sgpr100
-; GFX11-NEXT: ; implicit-def: $sgpr99
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr49
-; GFX11-NEXT: ; implicit-def: $sgpr37
-; GFX11-NEXT: ; implicit-def: $sgpr35
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr34
-; GFX11-NEXT: ; implicit-def: $sgpr52
-; GFX11-NEXT: ; implicit-def: $sgpr53
-; GFX11-NEXT: ; implicit-def: $sgpr50
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr36
-; GFX11-NEXT: ; implicit-def: $sgpr64
-; GFX11-NEXT: ; implicit-def: $sgpr38
-; GFX11-NEXT: ; implicit-def: $sgpr54
-; GFX11-NEXT: ; implicit-def: $sgpr96
-; GFX11-NEXT: ; implicit-def: $sgpr51
-; GFX11-NEXT: ; implicit-def: $sgpr67
-; GFX11-NEXT: ; implicit-def: $sgpr68
-; GFX11-NEXT: ; implicit-def: $sgpr65
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr55
-; GFX11-NEXT: ; implicit-def: $sgpr39
-; GFX11-NEXT: ; implicit-def: $sgpr71
-; GFX11-NEXT: ; implicit-def: $sgpr69
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr66
-; GFX11-NEXT: ; implicit-def: $sgpr82
-; GFX11-NEXT: ; implicit-def: $sgpr83
-; GFX11-NEXT: ; implicit-def: $sgpr80
-; GFX11-NEXT: ; implicit-def: $sgpr97
-; GFX11-NEXT: ; implicit-def: $sgpr70
-; GFX11-NEXT: ; implicit-def: $sgpr48
-; GFX11-NEXT: ; implicit-def: $sgpr84
-; GFX11-NEXT: ; implicit-def: $sgpr98
-; GFX11-NEXT: ; implicit-def: $sgpr81
-; GFX11-NEXT: ; implicit-def: $sgpr86
-; GFX11-NEXT: ; implicit-def: $sgpr87
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr85
-; GFX11-NEXT: ; implicit-def: $sgpr30
-; GFX11-NEXT: ; implicit-def: $sgpr94
-; GFX11-NEXT: ; implicit-def: $sgpr92
-; GFX11-NEXT: ; implicit-def: $sgpr90
-; GFX11-NEXT: ; implicit-def: $sgpr88
-; GFX11-NEXT: ; implicit-def: $sgpr78
-; GFX11-NEXT: ; implicit-def: $sgpr76
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s4, 0
-; GFX11-NEXT: v_writelane_b32 v43, s5, 1
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s4, 2
-; GFX11-NEXT: v_writelane_b32 v43, s5, 3
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s74, 4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: v_writelane_b32 v43, s75, 5
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; kill: killed $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: v_writelane_b32 v43, s74, 6
-; GFX11-NEXT: v_writelane_b32 v43, s75, 7
-; GFX11-NEXT: ; implicit-def: $sgpr74
-; GFX11-NEXT: s_branch .LBB91_2
-; GFX11-NEXT: .LBB91_4:
-; GFX11-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
-; GFX11-NEXT: v_readlane_b32 s94, v43, 2
-; GFX11-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
-; GFX11-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
-; GFX11-NEXT: v_readlane_b32 s95, v43, 3
-; GFX11-NEXT: v_readlane_b32 vcc_lo, v43, 6
-; GFX11-NEXT: v_readlane_b32 s30, v43, 0
-; GFX11-NEXT: v_readlane_b32 s34, v43, 4
-; GFX11-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
-; GFX11-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
-; GFX11-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
-; GFX11-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
-; GFX11-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
-; GFX11-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
-; GFX11-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
-; GFX11-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
-; GFX11-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
-; GFX11-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
-; GFX11-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
-; GFX11-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
-; GFX11-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
-; GFX11-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
-; GFX11-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
-; GFX11-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
-; GFX11-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
-; GFX11-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
-; GFX11-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
-; GFX11-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
-; GFX11-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
-; GFX11-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
-; GFX11-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
-; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
-; GFX11-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
-; GFX11-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
-; GFX11-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
-; GFX11-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
-; GFX11-NEXT: s_mov_b32 s58, s11
-; GFX11-NEXT: v_readlane_b32 s59, v43, 8
-; GFX11-NEXT: v_readlane_b32 s72, v43, 9
-; GFX11-NEXT: v_readlane_b32 s60, v43, 10
-; GFX11-NEXT: v_readlane_b32 s61, v43, 11
-; GFX11-NEXT: v_readlane_b32 s62, v43, 12
-; GFX11-NEXT: v_readlane_b32 s63, v43, 13
-; GFX11-NEXT: v_readlane_b32 s73, v43, 14
-; GFX11-NEXT: v_readlane_b32 s13, v43, 15
-; GFX11-NEXT: v_readlane_b32 s15, v43, 16
-; GFX11-NEXT: v_readlane_b32 s41, v43, 17
-; GFX11-NEXT: v_readlane_b32 s43, v43, 18
-; GFX11-NEXT: v_readlane_b32 s56, v43, 19
-; GFX11-NEXT: v_readlane_b32 s11, v43, 20
-; GFX11-NEXT: v_readlane_b32 s57, v43, 21
-; GFX11-NEXT: v_readlane_b32 s10, v43, 22
-; GFX11-NEXT: v_readlane_b32 s74, v43, 23
-; GFX11-NEXT: v_readlane_b32 s9, v43, 24
-; GFX11-NEXT: v_readlane_b32 s75, v43, 25
-; GFX11-NEXT: v_readlane_b32 s8, v43, 26
-; GFX11-NEXT: v_readlane_b32 s76, v43, 27
-; GFX11-NEXT: v_readlane_b32 s77, v43, 28
-; GFX11-NEXT: v_readlane_b32 s78, v43, 29
-; GFX11-NEXT: v_readlane_b32 s79, v43, 30
-; GFX11-NEXT: v_readlane_b32 s88, v43, 31
-; GFX11-NEXT: v_readlane_b32 s89, v42, 0
-; GFX11-NEXT: v_readlane_b32 s90, v42, 1
-; GFX11-NEXT: v_readlane_b32 s91, v42, 2
-; GFX11-NEXT: v_readlane_b32 s92, v42, 3
-; GFX11-NEXT: v_readlane_b32 s47, v42, 4
-; GFX11-NEXT: v_readlane_b32 s93, v42, 5
-; GFX11-NEXT: v_readlane_b32 vcc_hi, v43, 7
-; GFX11-NEXT: v_readlane_b32 s46, v42, 6
-; GFX11-NEXT: v_readlane_b32 s31, v43, 1
-; GFX11-NEXT: v_readlane_b32 s95, v42, 7
-; GFX11-NEXT: v_readlane_b32 s45, v42, 8
-; GFX11-NEXT: v_readlane_b32 s35, v43, 5
-; GFX11-NEXT: .LBB91_5: ; %end
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s104, 8
-; GFX11-NEXT: s_and_b32 s5, s103, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s42, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s4
-; GFX11-NEXT: s_or_b32 s4, s5, s6
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s102, 8
-; GFX11-NEXT: s_and_b32 s6, s58, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s101, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s4
-; GFX11-NEXT: s_or_b32 s1, s1, s5
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s100, 8
-; GFX11-NEXT: s_and_b32 s5, s99, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s40, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s4
-; GFX11-NEXT: s_or_b32 s4, s5, s6
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s45, 8
-; GFX11-NEXT: s_and_b32 s6, s59, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s95, 8
-; GFX11-NEXT: s_or_b32 s3, s3, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s4
-; GFX11-NEXT: s_or_b32 s3, s3, s5
-; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
-; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
-; GFX11-NEXT: s_and_b32 s0, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s46, 8
-; GFX11-NEXT: s_and_b32 s2, s93, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s14, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s17, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s47, 8
-; GFX11-NEXT: s_and_b32 s4, s72, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s92, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s91, 8
-; GFX11-NEXT: s_and_b32 s4, s90, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s12, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s4, s19, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s89, 8
-; GFX11-NEXT: s_and_b32 s6, s60, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s88, 8
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
-; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
-; GFX11-NEXT: s_and_b32 s0, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s79, 8
-; GFX11-NEXT: s_and_b32 s2, s78, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s30, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s21, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s77, 8
-; GFX11-NEXT: s_and_b32 s4, s61, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s76, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_b32 s2, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s8, 8
-; GFX11-NEXT: s_and_b32 s4, s75, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s94, 8
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s4, s23, 0xff
-; GFX11-NEXT: s_lshl_b32 s5, s9, 8
-; GFX11-NEXT: s_and_b32 s6, s62, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s74, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_or_b32 s5, s6, s7
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: scratch_store_b128 v0, v[97:100], off
-; GFX11-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
-; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
-; GFX11-NEXT: s_and_b32 s0, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s10, 8
-; GFX11-NEXT: s_and_b32 s2, s57, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s34, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s4
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_lshl_b32 s2, s11, 8
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s25, 0xff
-; GFX11-NEXT: s_and_b32 s3, s63, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s56, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: s_and_b32 s3, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, s43, 8
-; GFX11-NEXT: s_or_b32 s1, s1, s2
-; GFX11-NEXT: s_or_b32 s2, s3, s4
-; GFX11-NEXT: s_and_b32 s3, s41, 0xff
-; GFX11-NEXT: s_lshl_b32 s4, vcc_lo, 8
-; GFX11-NEXT: s_lshl_b32 s5, s15, 8
-; GFX11-NEXT: s_or_b32 s3, s3, s4
-; GFX11-NEXT: s_and_b32 s4, s27, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s13, 8
-; GFX11-NEXT: s_or_b32 s4, s4, s5
-; GFX11-NEXT: s_and_b32 s5, s73, 0xff
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
-; GFX11-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
-; GFX11-NEXT: s_or_b32 s2, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
-; GFX11-NEXT: v_or_b32_e32 v6, v23, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v11, v96, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GFX11-NEXT: v_or_b32_e32 v23, v6, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v21
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v87
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v86
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX11-NEXT: v_or_b32_e32 v7, v11, v21
-; GFX11-NEXT: v_or_b32_e32 v11, v22, v13
-; GFX11-NEXT: v_or_b32_e32 v10, v26, v10
-; GFX11-NEXT: v_or_b32_e32 v13, v24, v14
-; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v85
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v84
-; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v28
-; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v83
-; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-NEXT: v_or_b32_e32 v15, v22, v15
-; GFX11-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-NEXT: v_or_b32_e32 v21, v26, v27
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT: v_or_b32_e32 v24, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v25, v11, v10
-; GFX11-NEXT: v_or_b32_e32 v26, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v6, v15, v9
-; GFX11-NEXT: v_or_b32_e32 v7, v16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v32
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v17
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v82
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v31
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v81
-; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v35
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v19
-; GFX11-NEXT: v_or_b32_e32 v9, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v11, v8
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_or_b32_e32 v13, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v80
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v34
-; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v20
-; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v33
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v71
-; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v38
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v70
-; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v69
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT: v_or_b32_e32 v4, v14, v4
-; GFX11-NEXT: v_or_b32_e32 v14, v15, v16
-; GFX11-NEXT: v_or_b32_e32 v15, v17, v18
-; GFX11-NEXT: v_or_b32_e32 v16, v19, v20
-; GFX11-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT: v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT: v_or_b32_e32 v13, v13, v4
-; GFX11-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT: v_or_b32_e32 v15, v16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v36
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v68
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v37
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v67
-; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v49
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v66
-; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v65
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v39
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v64
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v4, v10, v11
-; GFX11-NEXT: v_or_b32_e32 v10, v16, v17
-; GFX11-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX11-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v48
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v55
-; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v52
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v54
-; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v53
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v51
-; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v50
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT: v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT: v_or_b32_e32 v1, v20, v1
-; GFX11-NEXT: v_or_b32_e32 v12, v21, v12
-; GFX11-NEXT: v_or_b32_e32 v5, v22, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_or_b32_e32 v16, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v1, v10, v2
-; GFX11-NEXT: v_or_b32_e32 v2, v11, v18
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v19
-; GFX11-NEXT: v_or_b32_e32 v4, v12, v5
-; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
-; GFX11-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
-; GFX11-NEXT: scratch_store_b128 v0, v[23:26], off offset:64
-; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
-; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:96
-; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: v_readlane_b32 s104, v41, 8
-; GFX11-NEXT: v_readlane_b32 s103, v41, 7
-; GFX11-NEXT: v_readlane_b32 s102, v41, 6
-; GFX11-NEXT: v_readlane_b32 s101, v41, 5
-; GFX11-NEXT: v_readlane_b32 s100, v41, 4
-; GFX11-NEXT: v_readlane_b32 s99, v41, 3
-; GFX11-NEXT: v_readlane_b32 s98, v41, 2
-; GFX11-NEXT: v_readlane_b32 s97, v41, 1
-; GFX11-NEXT: v_readlane_b32 s96, v41, 0
-; GFX11-NEXT: v_readlane_b32 s87, v40, 31
-; GFX11-NEXT: v_readlane_b32 s86, v40, 30
-; GFX11-NEXT: v_readlane_b32 s85, v40, 29
-; GFX11-NEXT: v_readlane_b32 s84, v40, 28
-; GFX11-NEXT: v_readlane_b32 s83, v40, 27
-; GFX11-NEXT: v_readlane_b32 s82, v40, 26
-; GFX11-NEXT: v_readlane_b32 s81, v40, 25
-; GFX11-NEXT: v_readlane_b32 s80, v40, 24
-; GFX11-NEXT: v_readlane_b32 s71, v40, 23
-; GFX11-NEXT: v_readlane_b32 s70, v40, 22
-; GFX11-NEXT: v_readlane_b32 s69, v40, 21
-; GFX11-NEXT: v_readlane_b32 s68, v40, 20
-; GFX11-NEXT: v_readlane_b32 s67, v40, 19
-; GFX11-NEXT: v_readlane_b32 s66, v40, 18
-; GFX11-NEXT: v_readlane_b32 s65, v40, 17
-; GFX11-NEXT: v_readlane_b32 s64, v40, 16
-; GFX11-NEXT: v_readlane_b32 s55, v40, 15
-; GFX11-NEXT: v_readlane_b32 s54, v40, 14
-; GFX11-NEXT: v_readlane_b32 s53, v40, 13
-; GFX11-NEXT: v_readlane_b32 s52, v40, 12
-; GFX11-NEXT: v_readlane_b32 s51, v40, 11
-; GFX11-NEXT: v_readlane_b32 s50, v40, 10
-; GFX11-NEXT: v_readlane_b32 s49, v40, 9
-; GFX11-NEXT: v_readlane_b32 s48, v40, 8
-; GFX11-NEXT: v_readlane_b32 s39, v40, 7
-; GFX11-NEXT: v_readlane_b32 s38, v40, 6
-; GFX11-NEXT: v_readlane_b32 s37, v40, 5
-; GFX11-NEXT: v_readlane_b32 s36, v40, 4
-; GFX11-NEXT: v_readlane_b32 s35, v40, 3
-; GFX11-NEXT: v_readlane_b32 s34, v40, 2
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b32 v40, off, s32
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12
-; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1
+; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:12
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s96, 0
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s72, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s46, v9
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s47, v10
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v12
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5
+; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s102, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s103, 7
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s104, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 11
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 12
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 13
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 14
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 17
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 18
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 19
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s68, 20
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s69, 21
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s70, 22
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s71, 23
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s80, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s81, 25
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s82, 26
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s83, 27
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s84, 28
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s85, 29
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s86, 30
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s1, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s0, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s43, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s43, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s43, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s42, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s42, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s45, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s45, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s45, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 19
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s44, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s47, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s47, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s47, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s46, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s46, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s59, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s59, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s58, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s58, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s61, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s61, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s61, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s60, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s60, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s63, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 12
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s63, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s62, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s62, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s73, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s73, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s73, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s72, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s72, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s29, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 26
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s29, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s28, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s28, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[42:43], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[44:45], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[46:47], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s44, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 7
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 5
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 3
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s12, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s13, 1
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4
+; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s29, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s29, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s15, s45, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s45, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s43, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s43, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s73, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s77, s73, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s76, s72, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s63, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s74, s63, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s73, s62, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s63, s61, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s61, s60, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s60, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s40, s59, 0xffff0000
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s45, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s59, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s29, s58, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s58, 16
+; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s45, 0x10010
+; GFX11-TRUE16-NEXT: s_and_b32 s12, s47, 0xffff0000
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s4, s45
+; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s47, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s47, s46, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s41, s46, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s44, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s44, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s42, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s42, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s45, 22
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s45, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s78
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s42, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v6
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s1, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s1
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v25.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s42, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v8, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s0, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s0
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s76
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s75
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s42, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s3, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s3
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s74
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v26.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v28.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
+; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s42, s11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s2, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s2
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s73
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s72
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s42, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s17, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v31.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s17
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s63
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[4:5]
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s62
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s42, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s16, 0x10010
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s16
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s61
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s57
+; GFX11-TRUE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s42, s42, s43
+; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s42, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s43, s19, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.l
+; GFX11-TRUE16-NEXT: s_add_i32 s43, s43, s19
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s43, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s43
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s56
+; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v34.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s17, s60
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s16, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40
+; GFX11-TRUE16-NEXT: s_bfe_u32 s40, s42, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, s42
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s42, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s40, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s42, s40
+; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s40, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s18, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s18
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s42
+; GFX11-TRUE16-NEXT: s_and_b32 s42, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s42
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s29
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s29, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s42, s29, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s42, s42, s29
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s29, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s42, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: s_and_b32 s43, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s29, s29, s42
+; GFX11-TRUE16-NEXT: s_lshl_b32 s21, s21, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s29, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s2, s11
+; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.l
+; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s21, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s21
+; GFX11-TRUE16-NEXT: s_and_b32 s21, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s13
+; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s3, s59
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s12
+; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s13, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s13
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s13, s12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s20, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s13, 0x10010
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s13
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s20, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s13, s14
+; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s47
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-TRUE16-NEXT: s_bfe_u32 s20, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_add_i32 s29, s20, s14
+; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s13, 16
+; GFX11-TRUE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s29
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s41
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v7
+; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s13, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s23, s14, 0x10010
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s6
+; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s23, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s23
+; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s13, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v38.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v48.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s22, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15
+; GFX11-TRUE16-NEXT: s_and_b32 s15, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s10, 0x10010
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s9, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s8, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v51.l
+; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s8
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v8
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v50.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v49.l
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s9, 0x10010
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s9
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s9, s7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v53.l
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s8, 0x10010
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s8, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s5, 16
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s4, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s4
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v54.l
+; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s5, 0x10010
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v52.l
+; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s5, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v55.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[22:23]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[20:21]
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17]
+; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63
+; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s5, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s14, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v65
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[64:65]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[68:69]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v65
+; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s11, 0x10010
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v64
+; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, s11
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22
+; GFX11-TRUE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s11, s12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s27, s73
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s26, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 vcc, s[46:47], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s47, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s47, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s46, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s46, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s45, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s45, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s44, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s43, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s42, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s42, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s29, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-TRUE16-NEXT: s_branch .LBB91_5
+; GFX11-TRUE16-NEXT: .LBB91_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 1
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; kill: killed $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s74, 6
+; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s75, 7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-TRUE16-NEXT: s_branch .LBB91_2
+; GFX11-TRUE16-NEXT: .LBB91_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s94 :: v_dual_mov_b32 v14, s30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s49 :: v_dual_mov_b32 v5, s35
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_lo, v43, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v43, 4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, s42 :: v_dual_mov_b32 v54, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s10 :: v_dual_mov_b32 v53, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v49, s98
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, s46 :: v_dual_mov_b32 v38, s47
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s97 :: v_dual_mov_b32 v39, s58
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s59 :: v_dual_mov_b32 v36, s60
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s9 :: v_dual_mov_b32 v32, s61
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v33, s62
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s63 :: v_dual_mov_b32 v30, s72
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s96 :: v_dual_mov_b32 v26, s73
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s7 :: v_dual_mov_b32 v27, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s29 :: v_dual_mov_b32 v25, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s87 :: v_dual_mov_b32 v64, s86
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s85 :: v_dual_mov_b32 v10, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, s4 :: v_dual_mov_b32 v68, s48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, s81 :: v_dual_mov_b32 v66, s84
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s83 :: v_dual_mov_b32 v69, s70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s82 :: v_dual_mov_b32 v23, s80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v71, s66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s39 :: v_dual_mov_b32 v21, s69
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v81, s55
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v19, s65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v83, s51
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v17, s54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v11, s52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v12, s50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s78 :: v_dual_mov_b32 v7, s88
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
+; GFX11-TRUE16-NEXT: s_mov_b32 s58, s11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v43, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v43, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v43, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v43, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s46, v43, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v43, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v43, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 24
+; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 29
+; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v42, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v42, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s92, v42, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v42, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 vcc_hi, v43, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v42, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v42, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v43, 5
+; GFX11-TRUE16-NEXT: .LBB91_5: ; %end
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s104, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s103, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s56, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s102, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s58, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s101, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s100, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s99, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s43, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s59, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s95, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s44, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s93, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s14, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s17, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s45, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s60, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s92, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s91, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s90, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s12, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s89, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s61, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s88, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s79, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s78, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s30, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s21, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s77, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s62, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s76, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s8, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s75, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s94, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s9, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s63, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s74, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s34, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s11, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s72, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s47, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s46, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s41, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s27, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s13, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v27, 0xff, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v4, 8, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v5, 8, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v4, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v87
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v26, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v25, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v26, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v28, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v29, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v11, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v12, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v81
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v35
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v70
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v18, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v17, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v48
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v50
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v66
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v21, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v8, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v10, v3
+; GFX11-TRUE16-NEXT: s_clause 0x5
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:112
+; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v41, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v41, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v41, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v41, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v41, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v41, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v41, 0
+; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v40, 31
+; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v40, 30
+; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v40, 29
+; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v40, 28
+; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v40, 27
+; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v40, 26
+; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v40, 25
+; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v40, 24
+; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v40, 23
+; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v40, 22
+; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v40, 21
+; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v40, 20
+; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v40, 19
+; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v40, 18
+; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v40, 17
+; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v40, 16
+; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v40, 15
+; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v40, 14
+; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v40, 13
+; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v40, 12
+; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v40, 11
+; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10
+; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9
+; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8
+; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7
+; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6
+; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5
+; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3
+; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-TRUE16-NEXT: s_clause 0x3
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:12
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:12
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s96, 0
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s72, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s97, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s63, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s98, 2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5
+; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s103, 7
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s104, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 10
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 11
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 12
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 13
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 14
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 15
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 17
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 18
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 19
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s68, 20
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s69, 21
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s70, 22
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s71, 23
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s80, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s81, 25
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s82, 26
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s83, 27
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s84, 28
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s85, 29
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s86, 30
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s47, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4
+; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000
+; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010
+; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77
+; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76
+; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45
+; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s19
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45
+; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40
+; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44
+; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28
+; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s18, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44
+; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29
+; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10
+; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28
+; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9
+; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28
+; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28
+; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16
+; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, s29
+; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28
+; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15
+; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9
+; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12
+; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5
+; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3
+; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s7
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12
+; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11
+; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[17:18]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16]
+; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71]
+; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20]
+; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff
+; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60
+; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8
+; GFX11-FAKE16-NEXT: s_branch .LBB91_5
+; GFX11-FAKE16-NEXT: .LBB91_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6
+; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74
+; GFX11-FAKE16-NEXT: s_branch .LBB91_2
+; GFX11-FAKE16-NEXT: .LBB91_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92
+; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5
+; GFX11-FAKE16-NEXT: .LBB91_5: ; %end
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s42, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v15, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5
+; GFX11-FAKE16-NEXT: s_clause 0x5
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96
+; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v41, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v41, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v41, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v41, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v41, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v41, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v41, 0
+; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v40, 31
+; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v40, 30
+; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v40, 29
+; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v40, 28
+; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v40, 27
+; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v40, 26
+; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v40, 25
+; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v40, 24
+; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v40, 23
+; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v40, 22
+; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v40, 21
+; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v40, 20
+; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v40, 19
+; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v40, 18
+; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v40, 17
+; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v40, 16
+; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v40, 15
+; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v40, 14
+; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v40, 13
+; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v40, 12
+; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v40, 11
+; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v40, 10
+; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9
+; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8
+; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7
+; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6
+; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5
+; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4
+; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3
+; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-FAKE16-NEXT: s_clause 0x3
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:12
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -181165,9 +185691,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -181183,6 +185710,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -181193,201 +185721,169 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3
; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true
@@ -181427,57 +185923,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -181486,7 +185984,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -181497,18 +185995,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -181516,13 +186014,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -181531,29 +186029,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -181567,8 +186065,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -181576,167 +186074,141 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -186713,69 +191185,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -186784,95 +191256,91 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -186883,345 +191351,283 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
@@ -203708,9 +208114,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35
; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
@@ -203726,6 +208133,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
@@ -203736,201 +208144,169 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3
; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true
@@ -203970,57 +208346,59 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
@@ -204029,7 +208407,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -204040,18 +208418,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v150, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -204059,13 +208437,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
@@ -204074,29 +208452,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
@@ -204110,8 +208488,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
@@ -204119,167 +208497,141 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v103, 0x300, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v128, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v113, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v53
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v102, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v96
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v97, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v100
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v87, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v86, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v85, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v84, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v50
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v83, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v48
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v49
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v39
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v82, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v81, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v71, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v70, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v69, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v35
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v112, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v68, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v66, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v65, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v36.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.h, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.h, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.h, v144.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.h, v145.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.h, v131.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.h, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.h, v150.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.h, v160.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.h, v179.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.h, v181.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l
; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
; GFX11-TRUE16-NEXT: s_clause 0x1e
; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
@@ -209415,69 +213767,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -209486,95 +213838,91 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -209585,345 +213933,283 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
@@ -222020,700 +226306,1362 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v64f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
-; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
-; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
-; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
-; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB101_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB101_4
-; GFX11-NEXT: .LBB101_2: ; %cmp.true
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v11, v3
-; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v8, v3
-; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0
-; GFX11-NEXT: v_bfe_u32 v0, v6, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
-; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v19, v19, 16, v34
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
-; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v6
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v5, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v8, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v0
-; GFX11-NEXT: v_bfe_u32 v0, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v7, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v6
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v8, v7
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v5, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v9
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v6
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v9
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v11, v5
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v10, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v13, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v7
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v8
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v13
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v15, v12, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v10
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v11
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9
-; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v11
-; GFX11-NEXT: v_bfe_u32 v82, v14, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v82, v82, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_bfe_u32 v83, v12, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v5, v68, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v82, v83, v12
-; GFX11-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v82, v83, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v13
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v83
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v82, v83
-; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13
-; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v86, v86, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v14
-; GFX11-NEXT: v_bfe_u32 v14, v83, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86
-; GFX11-NEXT: v_add_f32_e64 v86, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_or_b32_e32 v85, 0x400000, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v86
-; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v97, v82, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v85, v86, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v82
-; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v82
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v85, v85, v86
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
-; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v14, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101
-; GFX11-NEXT: v_or_b32_e32 v101, 0x400000, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96
-; GFX11-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v82
-; GFX11-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v65, 16, v67
-; GFX11-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v85
-; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v80, 0xffff, v80
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v86
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v86, 0xffff, v96
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v68
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v83, 16, v82
-; GFX11-NEXT: v_lshl_or_b32 v15, v85, 16, v86
-; GFX11-NEXT: v_and_b32_e32 v83, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v86, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v96
-; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v85, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v87, 16, v83
-; GFX11-NEXT: v_lshl_or_b32 v10, v9, 16, v86
-; GFX11-NEXT: v_lshl_or_b32 v13, v97, 16, v82
-; GFX11-NEXT: v_and_b32_e32 v82, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v9, v81, 16, v96
-; GFX11-NEXT: v_and_b32_e32 v81, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v83, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v11, v84, 16, v85
-; GFX11-NEXT: v_lshl_or_b32 v6, v69, 16, v82
-; GFX11-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v64, 16, v66
-; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v7, v70, 16, v81
-; GFX11-NEXT: v_and_b32_e32 v70, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v55, 16, v69
-; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v64
-; GFX11-NEXT: v_and_b32_e32 v51, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v27, v50, 16, v65
-; GFX11-NEXT: v_lshl_or_b32 v29, v52, 16, v55
-; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v24, v38, 16, v39
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v51
-; GFX11-NEXT: v_and_b32_e32 v37, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v8, v71, 16, v80
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v83
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v70
-; GFX11-NEXT: v_lshl_or_b32 v30, v53, 16, v54
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v66
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v49
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v50
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v52
-; GFX11-NEXT: v_lshl_or_b32 v20, v35, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v18, v33, 16, v37
-; GFX11-NEXT: v_lshl_or_b32 v17, v17, 16, v38
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB101_3:
-; GFX11-NEXT: s_branch .LBB101_2
-; GFX11-NEXT: .LBB101_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB101_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB101_4
+; GFX11-TRUE16-NEXT: .LBB101_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v85, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v97, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, v98, v97
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, 0x7fff, v98
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v33.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v35.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v37.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_f32 v2, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v38.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v39.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v48.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v50.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v49.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v51.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v0, v0, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v52.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v5 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v54.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v55.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v6 :: v_dual_add_nc_u32 v4, v7, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v8 :: v_dual_add_nc_u32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v10 :: v_dual_add_nc_u32 v8, v11, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v12 :: v_dual_add_nc_u32 v10, v13, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v82, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v11, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v10, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v83.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_add_nc_u32 v13, v15, v84
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v85, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v82
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v86.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v84, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v96, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v82
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v87, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v85.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v84, v99, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, v100, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v98, v99, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v84, 0x7fff, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v96.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v97
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v100, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v87.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB101_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB101_2
+; GFX11-TRUE16-NEXT: .LBB101_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB101_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB101_4
+; GFX11-FAKE16-NEXT: .LBB101_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v8, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v82, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v83, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v82, v83
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, v86, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v83, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v86
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v82, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v85, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
+; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v80
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB101_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB101_2
+; GFX11-FAKE16-NEXT: .LBB101_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -228090,568 +233038,496 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v16 :: v_dual_lshlrev_b32 v35, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v52, 16, v24
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v32
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v16, v16, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v34, 16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v82, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v37, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_lshlrev_b32 v83, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v33, v38, v48
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v35, v37 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v38, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v37, v32 :: v_dual_add_f32 v37, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v34, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v38, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v19, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v48
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v35, v38, v39 :: v_dual_lshlrev_b32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v39 :: v_dual_lshlrev_b32 v39, 16, v20
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_lshlrev_b32 v87, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v38, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v96, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v37, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v39, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v20, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_add_f32 v49, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v35 :: v_dual_lshlrev_b32 v21, 16, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v39, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v87
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v38, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v38, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v35.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v32.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v38, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v37, v48, v49 :: v_dual_lshlrev_b32 v48, 16, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_cndmask_b32 v21, v39, v49
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v38, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v37.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v22, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v48, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v23, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_lshlrev_b32 v71, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_lshlrev_b32 v51, 16, v24
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v48
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v52
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v49, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v50, v51 :: v_dual_lshlrev_b32 v50, 16, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v48, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v50 :: v_dual_cndmask_b32 v23, v49, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v52, v48, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v48
+; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v24, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v49, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.h
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v50, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v52 :: v_dual_lshlrev_b32 v52, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v38, v39 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v50, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v50
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v49, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v52, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v50, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v50
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v49, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v25, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v54
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v51, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v49, v52, v53 :: v_dual_lshlrev_b32 v52, 16, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v50, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX11-TRUE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v49, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v49
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v52, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v38.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v54, v50, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v50
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v49.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v51, v54, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v51, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v27, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v26
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v54, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v51, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_cndmask_b32 v26, v50, v51
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v28, v50, v51 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53
-; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v55, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v26
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v37.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v64
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v53, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v51, v54, v55 :: v_dual_lshlrev_b32 v54, 16, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v27
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v51, v52, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v51, v53, v55, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v55
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v65 :: v_dual_lshlrev_b32 v65, 16, v30
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v50.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v65
-; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66
+; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v48.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v64, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v53, v64, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v54, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v53, v55, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v29, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v66
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v55, v29, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v52, v53 :: v_dual_and_b32 v31, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_add3_u32 v52, v55, v54, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v53, v64, v65 :: v_dual_lshlrev_b32 v64, 16, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v64 :: v_dual_cndmask_b32 v29, v55, v65
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v54, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v30, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v65, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_add_f32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v65, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v64, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v64
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v66
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v53.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v55, v65, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v55, v65, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v31, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v68
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v31
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v67, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v68
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v31, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v66, v67 :: v_dual_lshlrev_b32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v64, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v67, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v69
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v55, v64, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v66
-; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_cndmask_b32 v31, v65, v67
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v66, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v65, v67, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v68, v69, v66, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v66
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v65, v67, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v69
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v70
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v67, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v68
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v68
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v70
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v64, v65 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v64, v66, v67, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
-; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v69, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v50.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v70, v67, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v67
+; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v68, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v65, v66, v69, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
-; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v70, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v68, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v68
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v70
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v68, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v65, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v68, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v68
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v65.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v67, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v69, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71
-; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v70
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v52.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v67, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v67, v80, v69, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v69
+; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v70, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v68.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v80, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v69, v70, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80
-; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v70, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v70
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v69, v71, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v67.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v69, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v71, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81
-; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v80, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82
-; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_cndmask_b32 v5, v69, v81
+; GFX11-TRUE16-NEXT: v_add3_u32 v69, v82, v71, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v71
+; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v80, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v70.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v82, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v80, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v80
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v81, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v81, 0x40c00000, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v69.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v71, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v81, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83
-; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_cndmask_b32 v7, v71, v83
+; GFX11-TRUE16-NEXT: v_add3_u32 v71, v84, v81, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v82, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v84, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v81, v82, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84
-; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v82, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v82
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v81, v83, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v83, 0x40c00000, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v71.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v81, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v84, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v83, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v81
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v54.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v81, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v81, v86, v83, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v82.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v83, v86, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v7, 16, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v83, v84, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86
-; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v87, v84, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v84
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v83, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v85, 0x40c00000, v96
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v81.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v83, v83, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v85, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v86, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v82
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v84, v84, v87, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-TRUE16-NEXT: v_add3_u32 v87, v99, v86, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v86
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v97, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v83
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v87, v96, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v83, v86, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v83, 0x40c00000, v87
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v96, v85, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v84.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v86, v96, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v97, v83, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v97
+; GFX11-TRUE16-NEXT: v_add3_u32 v87, v99, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v85.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v96, v98, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v86, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v70
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6
-; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v96, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v15, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v113, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v96
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v17, 16, v69
-; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v96, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v66.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v27, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v66
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v27, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v38.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v85, v98, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v66.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v87, v96, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v98
-; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v34.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v97, v86, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v98, v98, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v83.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v96, v97, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v97, v98, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v98, v99, v87, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v98, v99, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v100, v100, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v100, v112, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v96, v96, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v97, v100, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v14.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v102, v113, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v101, v114, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v96
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v3, 16, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v87
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v98, v99, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v4, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v85, v97, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v86
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v84
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v68
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v64.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v17, 16, v65
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v54.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v67
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v22, 16, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v52.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v17, 16, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v49.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v22, 16, v52
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v86.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v17, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v22, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v17, 16, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v37, 16, v38
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v96, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v87.h
; GFX11-TRUE16-NEXT: .LBB104_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -231617,641 +236493,1242 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v64bf16_to_v64i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
-; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
-; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
-; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
-; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB105_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB105_4
-; GFX11-NEXT: .LBB105_2: ; %cmp.true
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v17
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0
-; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v11, v3
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v22
-; GFX11-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v23
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v24
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3
-; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v27
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
-; GFX11-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v29
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_lshlrev_b32 v5, 16, v29
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4
-; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v31
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v1
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v10
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65
-; GFX11-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v8, v10, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67
-; GFX11-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v10
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v5
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6
-; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v12
-; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v7
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v11, v7
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_bfe_u32 v71, v12, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v10, v14, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v14
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v14
-; GFX11-NEXT: v_bfe_u32 v83, v13, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v71, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v83, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v13
-; GFX11-NEXT: v_bfe_u32 v83, v15, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v83, v15
-; GFX11-NEXT: v_bfe_u32 v13, v84, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v84
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_bfe_u32 v85, v14, 16, 1
-; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v82
-; GFX11-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v84
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v85, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT: v_add_nc_u32_e32 v85, v86, v82
-; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v86, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v85, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_bfe_u32 v15, v87, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_bfe_u32 v97, v85, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v100, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v96
-; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96
-; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v85
-; GFX11-NEXT: v_bfe_u32 v103, v100, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98
-; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
-; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
-; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v85
-; GFX11-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101
-; GFX11-NEXT: v_add_nc_u32_e32 v103, v103, v100
-; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v87
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v86, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103
-; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v100
-; GFX11-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v87
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v28
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v27
-; GFX11-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55
-; GFX11-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96
-; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v11
-; GFX11-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v98
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v10
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v81
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v22
-; GFX11-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49
-; GFX11-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50
-; GFX11-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51
-; GFX11-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98
-; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85
-; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86
-; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96
-; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68
-; GFX11-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70
-; GFX11-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66
-; GFX11-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53
-; GFX11-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36
-; GFX11-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37
-; GFX11-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38
-; GFX11-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39
-; GFX11-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB105_3:
-; GFX11-NEXT: s_branch .LBB105_2
-; GFX11-NEXT: .LBB105_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4
+; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_cndmask_b32 v16, v5, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v0, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v5, v7 :: v_dual_and_b32 v7, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v34.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v35.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v32.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v37, v0, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v37.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v38.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v4, v7 :: v_dual_and_b32 v7, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v0, v1 :: v_dual_add_f32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v4, v8 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v49, v0, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v49.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v26, v4, v7 :: v_dual_and_b32 v7, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v7 :: v_dual_add_nc_u32 v0, v5, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v50.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v7, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v0, v4, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v52.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v53, v2, v6 :: v_dual_lshlrev_b32 v2, 16, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v53.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v54, v0, v4 :: v_dual_add_nc_u32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v0, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_add_nc_u32 v1, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v64.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v8 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v7 :: v_dual_add_nc_u32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v66.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v67.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v68.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v11 :: v_dual_add_nc_u32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v11 :: v_dual_add_nc_u32 v7, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v70.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v13 :: v_dual_add_nc_u32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v80, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v81, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v13 :: v_dual_add_nc_u32 v9, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v80, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v80
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v81, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v9, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v71.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v81
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v82.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v15 :: v_dual_add_nc_u32 v10, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v83, v10, v13
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v15, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v83
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, v81, v84
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v83, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v81, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v81, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v81
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v81
+; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v83, v83, v87
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v84.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v13, v85, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v86.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v85.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v14, v96 :: v_dual_add_nc_u32 v14, 0x7fff, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v97, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v83, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v14, v96, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v98, v83
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v99, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, v100, v97
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, v81, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v96, v99, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v98, 0x7fff, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v14, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v81, 0x7fff, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v96, v96, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v99
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v98, v102, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v87.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v96, 0x7fff, v96
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v65.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v81, v100, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v99, v99
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v96, v101, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v97.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB105_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB105_2
+; GFX11-TRUE16-NEXT: .LBB105_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4
+; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v11, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v21
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_lshlrev_b32 v5, 16, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v7, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v31
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v10, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v11, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v71, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v71, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v71, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v83, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v84, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v83, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v84, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v84
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v82
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v85, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v86, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v87, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v85, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v87, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v85, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v100, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v96
+; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v85
+; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v100, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v103, v103, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v86, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v87
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v28
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v27
+; GFX11-FAKE16-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55
+; GFX11-FAKE16-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v11
+; GFX11-FAKE16-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v98
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v10
+; GFX11-FAKE16-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v81
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v22
+; GFX11-FAKE16-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49
+; GFX11-FAKE16-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50
+; GFX11-FAKE16-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51
+; GFX11-FAKE16-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98
+; GFX11-FAKE16-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85
+; GFX11-FAKE16-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82
+; GFX11-FAKE16-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86
+; GFX11-FAKE16-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96
+; GFX11-FAKE16-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68
+; GFX11-FAKE16-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70
+; GFX11-FAKE16-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66
+; GFX11-FAKE16-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53
+; GFX11-FAKE16-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36
+; GFX11-FAKE16-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37
+; GFX11-FAKE16-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38
+; GFX11-FAKE16-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39
+; GFX11-FAKE16-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB105_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB105_2
+; GFX11-FAKE16-NEXT: .LBB105_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 64b5ecc..c6211aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -3090,108 +3090,206 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v4i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4125,19 +4223,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -4152,94 +4250,71 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -7554,108 +7629,206 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v4f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8614,19 +8787,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -8641,94 +8814,71 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11668,108 +11818,206 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v2i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12703,19 +12951,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -12730,94 +12978,71 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -15361,108 +15586,206 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v2f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v1, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v5.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v7, 16, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16408,19 +16731,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -16435,94 +16758,71 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -18246,83 +18546,75 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v4 :: v_dual_add_f32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v11, v12 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v11, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -18672,104 +18964,191 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v8i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19833,19 +20212,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -19860,94 +20239,71 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -21592,112 +21948,210 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v8f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v5, v11 :: v_dual_add_nc_u32 v10, v10, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v12, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v11 :: v_dual_add_nc_u32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v3, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v6, 16, v7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -22745,19 +23199,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -22772,94 +23226,71 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -24483,152 +24914,299 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v12, v16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v8bf16_to_v16i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s11, s3, 24
-; GFX11-NEXT: s_lshr_b32 s18, s3, 16
-; GFX11-NEXT: s_lshr_b32 s14, s3, 8
-; GFX11-NEXT: s_lshr_b32 s16, s2, 16
-; GFX11-NEXT: s_lshr_b32 s15, s2, 8
-; GFX11-NEXT: s_lshr_b32 s9, s1, 24
-; GFX11-NEXT: s_lshr_b32 s17, s1, 16
-; GFX11-NEXT: s_lshr_b32 s10, s1, 8
-; GFX11-NEXT: s_lshr_b32 s13, s0, 16
-; GFX11-NEXT: s_lshr_b32 s12, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v10, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v14, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v10
-; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v7
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: s_branch .LBB109_5
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr17
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr16
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr18
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
-; GFX11-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
-; GFX11-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
-; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
-; GFX11-NEXT: v_mov_b32_e32 v11, s6
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: .LBB109_5: ; %end
-; GFX11-NEXT: v_mov_b32_e32 v4, v17
-; GFX11-NEXT: v_mov_b32_e32 v12, v16
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s3, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_add_nc_u32 v1, v8, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: s_branch .LBB109_5
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: .LBB109_5: ; %end
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v17
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v16
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: s_branch .LBB109_5
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT: .LBB109_5: ; %end
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v17
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v16
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24960,19 +25538,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -24987,94 +25565,71 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index cb4b3bd..01e397d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -4485,203 +4485,384 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v8i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6298,31 +6479,33 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -6335,48 +6518,43 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -6387,122 +6565,88 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11493,203 +11637,384 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v8f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13349,31 +13674,33 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -13386,48 +13713,43 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -13438,122 +13760,88 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -18069,203 +18357,384 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v4i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19888,31 +20357,33 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -19925,48 +20396,43 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -19977,122 +20443,88 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -24093,203 +24525,384 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v4f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s8, s7, 16
-; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
-; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_lshl_b32 s5, s5, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
-; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
-; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v3, v9, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v13, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v14, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v1, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s7, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v9, v10
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v6, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v8, 16, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v12, 16, v9
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25929,31 +26542,33 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -25966,48 +26581,43 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -26018,122 +26628,88 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -28870,13 +29446,10 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
@@ -28884,142 +29457,128 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v11, v12 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v10, 16, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v15, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v14, v16, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v14, v10, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v14, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v15, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v11, v18, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v15, v16, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v21 :: v_dual_cndmask_b32 v11, v16, v19
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v16, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v17, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v14, v15 :: v_dual_lshlrev_b32 v17, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v19, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v16, v18 :: v_dual_lshlrev_b32 v18, 16, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v20, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v24, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v17, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v18, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v19, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v19, v25, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v20, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v15, v17, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v2, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v11, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v12, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v14.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -29645,175 +30204,334 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v16i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: s_and_b32 s8, s1, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_and_b32 s1, s2, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: s_and_b32 s1, s5, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v8, v6
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v7
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v10, v4
-; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v3, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v12, v5
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s4, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v12, v7
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: s_lshl_b32 s0, s5, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v14, v10
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10
-; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v15, v11
-; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v13, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s6, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v5
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s7, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v14, v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16
-; GFX11-NEXT: v_bfe_u32 v16, v7, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v7
-; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v22, v19
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_bfe_u32 v18, v21, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v19
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v2, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v10.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v16 :: v_dual_add_nc_u32 v6, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v6, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v18, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v20, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v6, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v17.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v8, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v10, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v12, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v12, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v14, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v15, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v14, v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v22, v19
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -32610,177 +33328,351 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB99_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB99_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB99_3
-; GFX11-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB99_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB99_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB99_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3
+; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB99_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB99_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB99_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3
+; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB99_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB99_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB99_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -34578,192 +35470,369 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v16f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s7, s19
-; GFX11-NEXT: s_mov_b32 s6, s18
-; GFX11-NEXT: s_mov_b32 s5, s17
-; GFX11-NEXT: s_mov_b32 s4, s16
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s1, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: s_and_b32 s8, s2, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s4, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v9, v5
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s3, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4
-; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s5, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s0, s5, 16
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5
-; GFX11-NEXT: v_bfe_u32 v5, v12, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_bfe_u32 v13, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v12
-; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s6, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v6
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s7, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v18, v14, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v14
-; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v12
-; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v7, v13, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v4, v17, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v2, v11, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v8, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v14
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s19
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s18
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s17
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s16
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s5, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s7, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v2 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v17
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v2, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s4, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v15, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v12, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v20, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v10.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v15.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s19
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s18
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s17
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s16
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s4, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s4, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s5, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s5, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v12
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s6, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s7, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s7, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, v16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v12, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v13, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v17, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v9, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v8, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v14
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37502,177 +38571,351 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB107_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB107_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB107_3
-; GFX11-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB107_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB107_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB107_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3
+; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB107_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB107_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB107_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3
+; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB107_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB107_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB107_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -39594,281 +40837,552 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v28, v32
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v16bf16_to_v32i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_mov_b32 s12, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s27, s19, 24
-; GFX11-NEXT: s_lshr_b32 s46, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s19, 8
-; GFX11-NEXT: s_lshr_b32 s42, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s18, 8
-; GFX11-NEXT: s_lshr_b32 s23, s17, 24
-; GFX11-NEXT: s_lshr_b32 s45, s17, 16
-; GFX11-NEXT: s_lshr_b32 s26, s17, 8
-; GFX11-NEXT: s_lshr_b32 s29, s16, 16
-; GFX11-NEXT: s_lshr_b32 s28, s16, 8
-; GFX11-NEXT: s_lshr_b32 s15, s3, 24
-; GFX11-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-NEXT: s_lshr_b32 s22, s3, 8
-; GFX11-NEXT: s_lshr_b32 s25, s2, 16
-; GFX11-NEXT: s_lshr_b32 s24, s2, 8
-; GFX11-NEXT: s_lshr_b32 s13, s1, 24
-; GFX11-NEXT: s_lshr_b32 s43, s1, 16
-; GFX11-NEXT: s_lshr_b32 s14, s1, 8
-; GFX11-NEXT: s_lshr_b32 s21, s0, 16
-; GFX11-NEXT: s_lshr_b32 s20, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s4, s1, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s4, s0, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: s_and_b32 s1, s3, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s0, s2, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3
-; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v35
-; GFX11-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v0, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v9, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s2, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v9
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v4
-; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v14, 16, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v13, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v13, v15, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v13, v17, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v13, v17
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v18, v13, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v20, v16, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v13
-; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v16
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v4
-; GFX11-NEXT: v_bfe_u32 v17, v19, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v16
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v17, v17, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v18, v22, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v32
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v26, v30, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v17, v5, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v25, v7, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v26
-; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: s_branch .LBB109_5
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr20
-; GFX11-NEXT: ; implicit-def: $sgpr21
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr24
-; GFX11-NEXT: ; implicit-def: $sgpr25
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr22
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr28
-; GFX11-NEXT: ; implicit-def: $sgpr29
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr26
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr23
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr27
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
-; GFX11-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
-; GFX11-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
-; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
-; GFX11-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
-; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
-; GFX11-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
-; GFX11-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
-; GFX11-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
-; GFX11-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
-; GFX11-NEXT: v_mov_b32_e32 v27, s10
-; GFX11-NEXT: v_mov_b32_e32 v19, s8
-; GFX11-NEXT: v_mov_b32_e32 v11, s6
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: .LBB109_5: ; %end
-; GFX11-NEXT: v_mov_b32_e32 v4, v35
-; GFX11-NEXT: v_mov_b32_e32 v12, v34
-; GFX11-NEXT: v_mov_b32_e32 v20, v33
-; GFX11-NEXT: v_mov_b32_e32 v28, v32
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s19, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s18, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s16, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s3, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s3, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s3, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, v20, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v9 :: v_dual_add_nc_u32 v4, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v12 :: v_dual_add_nc_u32 v12, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v22.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v15, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v13, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v30.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v5.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_branch .LBB109_5
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v27, s10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, s8
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: .LBB109_5: ; %end
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v35
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v34
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v33
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v32
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v32i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s19, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s18, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s16, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s3, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v35
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v13, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v13, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v16
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v22, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v5, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v7, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: s_branch .LBB109_5
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT: .LBB109_5: ; %end
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v35
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v32
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -41383,177 +42897,351 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: s_branch .LBB111_2
;
-; GFX11-LABEL: bitcast_v32i8_to_v16bf16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB111_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v16
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v20
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v18
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v12
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v19
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e64 v3, 0xffff, s10
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v13
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v14
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v7, 16, v23
-; GFX11-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v3
-; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB111_3
-; GFX11-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v15
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v16
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v12
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v17
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT: v_or_b32_e32 v4, v20, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-NEXT: v_or_b32_e32 v6, v14, v6
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB111_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB111_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT: s_branch .LBB111_2
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16bf16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v10
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v21
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: .LBB111_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB111_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT: s_branch .LBB111_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16bf16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB111_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB111_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT: s_branch .LBB111_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 3aaf254..9041f64 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -3044,91 +3044,66 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8:
@@ -5025,39 +5000,41 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
@@ -5071,63 +5048,53 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -5140,147 +5107,110 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9991,91 +9921,66 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8:
@@ -11997,39 +11902,41 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3
@@ -12043,63 +11950,53 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -12112,147 +12009,110 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16367,91 +16227,66 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8:
@@ -19727,212 +19562,421 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB51_2
;
-; GFX11-LABEL: bitcast_v40i8_to_v20i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
-; GFX11-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
-; GFX11-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB51_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v22
-; GFX11-NEXT: v_and_b32_e64 v2, 0xffff, s10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v24
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v28
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v31
-; GFX11-NEXT: v_or_b32_e32 v3, v5, v34
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT: v_or_b32_e32 v9, v6, v35
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v36
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v37
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v20
-; GFX11-NEXT: v_or_b32_e32 v8, v8, v19
-; GFX11-NEXT: v_or_b32_e32 v12, v6, v17
-; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v32
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-NEXT: v_or_b32_e32 v10, v10, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v9, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v8, v12, 16, v13
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB51_3
-; GFX11-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v27
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-NEXT: v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v26
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v29
-; GFX11-NEXT: v_or_b32_e32 v5, v35, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v33, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v18
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v25
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v20
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v34, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v37, v2
-; GFX11-NEXT: v_or_b32_e32 v7, v32, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v22, v4
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_and_b32_e64 v10, 0xffff, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v6, v11, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB51_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB51_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT: s_branch .LBB51_2
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v37
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v34, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v35, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: s_branch .LBB51_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB51_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB51_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_branch .LBB51_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -22484,91 +22528,66 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8:
@@ -25790,212 +25809,421 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB63_2
;
-; GFX11-LABEL: bitcast_v40i8_to_v20f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
-; GFX11-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
-; GFX11-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v31, 8, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v7
-; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v9
-; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v11
-; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v13
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v15
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19
-; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v21
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-NEXT: s_or_b32 s5, s5, s6
-; GFX11-NEXT: s_or_b32 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-NEXT: s_or_b32 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-NEXT: s_or_b32 s8, s9, s10
-; GFX11-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-NEXT: s_or_b32 s9, s9, s10
-; GFX11-NEXT: s_or_b32 s10, s11, s12
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v22
-; GFX11-NEXT: v_and_b32_e64 v2, 0xffff, s10
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v24
-; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v28
-; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v31
-; GFX11-NEXT: v_or_b32_e32 v3, v5, v34
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v29
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v27
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v33
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT: v_or_b32_e32 v9, v6, v35
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v36
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v37
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v20
-; GFX11-NEXT: v_or_b32_e32 v8, v8, v19
-; GFX11-NEXT: v_or_b32_e32 v12, v6, v17
-; GFX11-NEXT: v_lshl_or_b32 v6, v0, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
-; GFX11-NEXT: v_or_b32_e32 v1, v1, v32
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-NEXT: v_or_b32_e32 v10, v10, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v9, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v8, v12, 16, v13
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v5, v2, 16, v1
-; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_3
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v27
-; GFX11-NEXT: s_add_i32 s28, s28, 3
-; GFX11-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-NEXT: s_add_i32 s24, s24, 3
-; GFX11-NEXT: s_or_b32 s4, s5, s4
-; GFX11-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-NEXT: s_add_i32 s26, s26, 3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: s_or_b32 s5, s6, s5
-; GFX11-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-NEXT: s_add_i32 s20, s20, 3
-; GFX11-NEXT: s_or_b32 s6, s7, s6
-; GFX11-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-NEXT: s_add_i32 s22, s22, 3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v28
-; GFX11-NEXT: v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v26
-; GFX11-NEXT: s_or_b32 s7, s8, s7
-; GFX11-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-NEXT: s_add_i32 s16, s16, 3
-; GFX11-NEXT: s_or_b32 s8, s9, s8
-; GFX11-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-NEXT: s_add_i32 s18, s18, 3
-; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_add_i32 s2, s2, 3
-; GFX11-NEXT: s_or_b32 s9, s10, s9
-; GFX11-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-NEXT: s_or_b32 s10, s11, s10
-; GFX11-NEXT: s_or_b32 s0, s1, s0
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_addk_i32 s5, 0x300
-; GFX11-NEXT: s_addk_i32 s6, 0x300
-; GFX11-NEXT: s_addk_i32 s9, 0x300
-; GFX11-NEXT: s_addk_i32 s10, 0x300
-; GFX11-NEXT: s_addk_i32 s0, 0x300
-; GFX11-NEXT: s_addk_i32 s1, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v29
-; GFX11-NEXT: v_or_b32_e32 v5, v35, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v33, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v23
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v18
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v25
-; GFX11-NEXT: s_addk_i32 s7, 0x300
-; GFX11-NEXT: s_addk_i32 s8, 0x300
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v20
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v34, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v10
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v37, v2
-; GFX11-NEXT: v_or_b32_e32 v7, v32, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v22, v4
-; GFX11-NEXT: s_addk_i32 s4, 0x300
-; GFX11-NEXT: v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v17, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-NEXT: v_and_b32_e64 v10, 0xffff, s4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v6, v11, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v3, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: .LBB63_3: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v37
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v29
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v10
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v34, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v32, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v35, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-TRUE16-NEXT: .LBB63_3: ; %end
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27
+; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26
+; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
+; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
+; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25
+; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
+; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4
+; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: .LBB63_3: ; %end
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -28791,39 +29019,38 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3
@@ -28837,65 +29064,55 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
@@ -28906,146 +29123,110 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -30878,91 +31059,66 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8:
@@ -32912,39 +33068,38 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3
@@ -32958,65 +33113,55 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
@@ -33027,146 +33172,110 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -35022,91 +35131,66 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 632b03c..73b57a5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -1482,46 +1482,87 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB15_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB15_4
-; GFX11-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB15_3:
-; GFX11-NEXT: s_branch .LBB15_2
-; GFX11-NEXT: .LBB15_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_4
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB15_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB15_2
+; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_4
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB15_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB15_2
+; GFX11-FAKE16-NEXT: .LBB15_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2279,17 +2320,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -2301,13 +2338,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3728,46 +3761,87 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB35_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB35_4
-; GFX11-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB35_3:
-; GFX11-NEXT: s_branch .LBB35_2
-; GFX11-NEXT: .LBB35_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_4
+; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB35_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB35_2
+; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_4
+; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB35_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB35_2
+; GFX11-FAKE16-NEXT: .LBB35_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4530,17 +4604,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
@@ -4552,13 +4622,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5440,27 +5506,24 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: .LBB50_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -5608,44 +5671,81 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v2i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB51_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB51_4
-; GFX11-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB51_3:
-; GFX11-NEXT: s_branch .LBB51_2
-; GFX11-NEXT: .LBB51_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_4
+; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB51_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB51_2
+; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_4
+; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB51_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB51_2
+; GFX11-FAKE16-NEXT: .LBB51_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6487,17 +6587,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -6509,13 +6605,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -7247,46 +7339,87 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v2f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB63_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB63_4
-; GFX11-NEXT: .LBB63_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB63_3:
-; GFX11-NEXT: s_branch .LBB63_2
-; GFX11-NEXT: .LBB63_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_4
+; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB63_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB63_2
+; GFX11-TRUE16-NEXT: .LBB63_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_4
+; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB63_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB63_2
+; GFX11-FAKE16-NEXT: .LBB63_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8138,17 +8271,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
@@ -8160,13 +8289,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8685,46 +8810,87 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v1i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB73_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB73_4
-; GFX11-NEXT: .LBB73_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB73_3:
-; GFX11-NEXT: s_branch .LBB73_2
-; GFX11-NEXT: .LBB73_4:
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_4
+; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB73_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB73_2
+; GFX11-TRUE16-NEXT: .LBB73_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_4
+; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB73_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB73_2
+; GFX11-FAKE16-NEXT: .LBB73_4:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9290,57 +9456,109 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v2bf16_to_v4i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB77_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s2, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_vccnz .LBB77_4
-; GFX11-NEXT: .LBB77_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB77_3:
-; GFX11-NEXT: ; implicit-def: $sgpr3
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr2
-; GFX11-NEXT: s_branch .LBB77_2
-; GFX11-NEXT: .LBB77_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_4
+; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB77_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-TRUE16-NEXT: s_branch .LBB77_2
+; GFX11-TRUE16-NEXT: .LBB77_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_4
+; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB77_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-FAKE16-NEXT: s_branch .LBB77_2
+; GFX11-FAKE16-NEXT: .LBB77_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9502,17 +9720,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
@@ -9524,13 +9738,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10212,17 +10422,13 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
@@ -10234,13 +10440,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index acc0247..d5d2d4aa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -374,59 +374,112 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v3bf16_to_v3f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB1_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB1_4
-; GFX11-NEXT: .LBB1_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s2, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB1_3:
-; GFX11-NEXT: s_branch .LBB1_2
-; GFX11-NEXT: .LBB1_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB1_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB1_2
+; GFX11-TRUE16-NEXT: .LBB1_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX11-FAKE16-NEXT: .LBB1_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB1_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB1_2
+; GFX11-FAKE16-NEXT: .LBB1_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -803,38 +856,36 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7fc0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1025,56 +1076,105 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v3bf16_to_v3i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB5_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB5_4
-; GFX11-NEXT: .LBB5_2: ; %cmp.true
-; GFX11-NEXT: s_lshl_b32 s2, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB5_3:
-; GFX11-NEXT: s_branch .LBB5_2
-; GFX11-NEXT: .LBB5_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0x7fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB5_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB5_2
+; GFX11-TRUE16-NEXT: .LBB5_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB5_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB5_2
+; GFX11-FAKE16-NEXT: .LBB5_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index d3fbba3..ee23420 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -7351,360 +7351,696 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v16i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8921,133 +9257,98 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -12574,53 +12875,52 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -12633,98 +12933,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -12745,226 +13029,170 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -22014,360 +22242,696 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v16f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23576,133 +24140,98 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -27358,53 +27887,52 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -27417,98 +27945,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -27529,226 +28041,170 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -36185,360 +36641,696 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v8i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37760,133 +38552,98 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -41418,53 +42175,52 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -41477,98 +42233,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -41589,226 +42329,170 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -49416,360 +50100,696 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v8f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s2, s26, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s3, s25, 16
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s24, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s23, 16
-; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_lshl_b32 s1, s22, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7
-; GFX11-NEXT: s_lshl_b32 s1, s21, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s20, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8
-; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4
-; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s18, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0
-; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_lshl_b32 s1, s16, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16
-; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: s_lshl_b32 s1, s15, 16
-; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1
-; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17
-; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
-; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16
-; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
-; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s13, 16
-; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_lshl_b32 s0, s12, 16
-; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21
-; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17
-; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
-; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s26, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v17, v5
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v19, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v17, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v18, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v19 :: v_dual_add_nc_u32 v19, v21, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v18, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s25, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s23, 16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s22, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s21, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s20, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v8, v7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v6, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s18, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v17, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v19, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s16, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v17, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v6, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s15, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v17
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v18, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v19, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, v20, v18
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, v23, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v24, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v18, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v16, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v20, 16, v17
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -50954,133 +51974,98 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -54638,53 +55623,52 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -54697,98 +55681,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -54809,226 +55777,170 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -60528,298 +61440,258 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v26, 16, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v0 :: v_dual_lshlrev_b32 v19, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v16, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v0, v0, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v20, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v18, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v21, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v12
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v2, v21, v16
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v16, v18, v20, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v22, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v22, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v13
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v18, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v18, v20, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v22, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v20, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v21, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v22, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v23, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v23
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v22, v23, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v4, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v22, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v3, v19, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v22, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v20.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v21, v22 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v24
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v24, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v22, v23, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v25, v22, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v19.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v24, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v23, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v25
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v22, v24, v25, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v24, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_cndmask_b32 v5, v21, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v26, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v26, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v21, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v23, v24 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v26
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v26, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v25, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add3_u32 v26, v27, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v23, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v26, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v25, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 0x40c00000, v27 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v27, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v26, v27, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v25, v26, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v7, v23, v27
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v28, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v28, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v25, v26 :: v_dual_and_b32 v9, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v28, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v28
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v25, v27, v28, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v27, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v28, v29, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v25, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v22.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v28, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v27, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v26, v27 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 0x40c00000, v29 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v29, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v29
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v28, v29, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v10, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v27, v28, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_cndmask_b32 v9, v25, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v30, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v30, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v25, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v27, v28 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v30, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v26
-; GFX11-TRUE16-NEXT: v_add3_u32 v27, v29, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v11, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v27, v28, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v31, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v27, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v27, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v28, v29 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v31, 0x40c00000, v31 :: v_dual_add_f32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v31, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v27
-; GFX11-TRUE16-NEXT: v_add3_u32 v29, v30, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v28, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v27, v30, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v13, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
-; GFX11-TRUE16-NEXT: v_add3_u32 v28, v34, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v31, v35 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v15, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v32, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v29
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v13, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v16, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v31, v35, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v21, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v34.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v31, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v36, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v29.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v32, v34, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v30, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v26.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v34
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v14, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v16, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v20
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v27.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v28.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -62037,325 +62909,620 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v32i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s12, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s13, 16
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_lshl_b32 s1, s27, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v3
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
-; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v4
-; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v10
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10
-; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v6
-; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v12
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v11
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v7
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v13
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v14, v15
-; GFX11-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s0
-; GFX11-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v15
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10
-; GFX11-NEXT: v_bfe_u32 v13, v23, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v23
-; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13
-; GFX11-NEXT: v_bfe_u32 v13, v25, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v15, v11
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v23
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_bfe_u32 v28, v14, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v25
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v28, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX11-NEXT: v_bfe_u32 v29, v15, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_bfe_u32 v13, v27, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v27
-; GFX11-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v15
-; GFX11-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v27
-; GFX11-NEXT: v_bfe_u32 v32, v28, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v14
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v32, v28
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v32, v29
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s1
-; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v28, v33, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1
-; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v31
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31
-; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v30
-; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, v32
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v29
-; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX11-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v9
-; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23
-; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27
-; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v21, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33
-; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v2, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v3, v7 :: v_dual_add_nc_u32 v5, v5, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v20.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v11 :: v_dual_add_nc_u32 v6, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v11 :: v_dual_add_nc_u32 v7, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v22.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v13 :: v_dual_add_nc_u32 v8, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v13 :: v_dual_add_nc_u32 v9, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v24
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v9, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v26.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v15 :: v_dual_add_nc_u32 v10, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v25
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v10, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v15, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v25, v28
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v25
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v27, v31
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v28.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v13, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v29.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v14, v32 :: v_dual_add_nc_u32 v14, 0x7fff, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v14, v32, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v34, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v33
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v25, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v14, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x7fff, v25
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v25, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v27.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v33.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s27, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v3
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v23, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v23
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v15, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v28, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v27
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v28, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v32, v28
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v28
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v32, v29
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v31
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v34, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, v35, v31
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v32
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v28, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31
+; GFX11-FAKE16-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; GFX11-FAKE16-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23
+; GFX11-FAKE16-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, 0xffff0000, v21, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29
+; GFX11-FAKE16-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -64107,133 +65274,98 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -70432,170 +71564,106 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3
; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -70604,11 +71672,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -70626,83 +71689,133 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB99_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -74458,358 +75571,687 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v32bf16_to_v32f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s15, s3
-; GFX11-NEXT: s_mov_b32 s14, s2
-; GFX11-NEXT: s_mov_b32 s13, s1
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_cmp_lg_u32 s28, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s1, s12, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s1, s13, 16
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000
-; GFX11-NEXT: s_lshl_b32 s0, s14, 16
-; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v9, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s15, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7
-; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v8
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5
-; GFX11-NEXT: v_bfe_u32 v36, v33, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v3
-; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v3, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_lshl_b32 s0, s16, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s17, 16
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v9
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v9
-; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v5
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s18, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v11, v6
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v7
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5
-; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_lshl_b32 s0, s19, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v10
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v13, v8
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s20, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v9
-; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v13
-; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: s_lshl_b32 s0, s21, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9
-; GFX11-NEXT: v_bfe_u32 v15, v12, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v10
-; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13
-; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s22, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
-; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v15
-; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v11
-; GFX11-NEXT: v_bfe_u32 v26, v14, 16, 1
-; GFX11-NEXT: s_lshl_b32 s0, s23, 16
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000
-; GFX11-NEXT: v_bfe_u32 v27, v12, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v14
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v27, v12
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s24, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
-; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12
-; GFX11-NEXT: v_bfe_u32 v26, v27, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v26, v27
-; GFX11-NEXT: v_bfe_u32 v15, v13, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: s_lshl_b32 s0, s25, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v13
-; GFX11-NEXT: v_bfe_u32 v30, v26, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
-; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v30, v30, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_bfe_u32 v14, v27, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30
-; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s27, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, v14, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo
-; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s26, 16
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
-; GFX11-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_bfe_u32 v34, v29, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT: v_bfe_u32 v26, v30, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v27, v34, v29
-; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v34, v36, v33
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v26, v26, v30
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v32, v32, v35
-; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
-; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32
-; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v35
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v15, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v33
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v31, 16, v30
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v14, v26, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v13, v32, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v5, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v9, v24, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v8, v23, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v27
-; GFX11-NEXT: v_lshl_or_b32 v5, v21, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v3, v18, 16, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v2, v17, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v1, v16, 16, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v23
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
-; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s25, 0xffff0000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v6 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v2 :: v_dual_add_nc_u32 v2, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v2, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v34, v33
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v6 :: v_dual_add_nc_u32 v4, v7, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v4, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v6, v8 :: v_dual_add_nc_u32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v6, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v22.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v8, v10 :: v_dual_add_nc_u32 v8, v11, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v8, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v24.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v10, v12 :: v_dual_add_nc_u32 v10, v13, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v11, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v10, v13
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v27.l
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v12, v14 :: v_dual_add_nc_u32 v13, v15, v28
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v29, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v25.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v28, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v32, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v31, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v29.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v31, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v28, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v19.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v31.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v3, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v8, 16, 1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v8
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v14, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v27, v12
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v26, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v30, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v27, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v29, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v34, v29
+; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v33
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v30
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v35
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v35
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v5, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v21, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v18, 16, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v17, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v16, 16, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v23
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -76401,133 +77843,98 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -82430,170 +83837,106 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3
; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -82602,11 +83945,6 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -82624,83 +83962,133 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB107_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -85053,57 +86441,57 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -85111,29 +86499,29 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[7:8]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v3
@@ -85141,11 +86529,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v3.h
@@ -85155,26 +86543,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v11.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5
@@ -85187,71 +86575,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v20, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v18, v23 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v18, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v23, v26, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v20 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v53.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v22, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v1, v20, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v1, v19 :: v_dual_and_b32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@@ -85304,305 +86693,266 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v65.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v66, v4, v5 :: v_dual_lshlrev_b32 v5, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v5, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v21
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v1, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v3, v8 :: v_dual_and_b32 v3, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v3, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v1, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v9
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v67.h
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v23
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v2, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v68.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v82.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v26
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v80.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v26
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v80.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v71.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v2, v3 :: v_dual_add_f32 v2, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v97.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v6, v7 :: v_dual_and_b32 v5, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v96.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v23
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v112, v4, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v112.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v98.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v113, v4, v6 :: v_dual_add_f32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v113.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[23:24]
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v33
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v7, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v103.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v113.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v115.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v117.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[37:38]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v39
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v39
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[38:39]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[32:33]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v38
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v131.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v1.l, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v52.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v103.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v84.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -92726,170 +94076,106 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9
+; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
+; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v48
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v82
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v3, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v23
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v65
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v27
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v69
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v70
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v84
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v3, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3
; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
@@ -92898,11 +94184,6 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
@@ -92920,83 +94201,133 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
+; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v70, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_add_nc_u32 v13, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v29, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v32
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v19, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v17, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v36
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v53, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v33
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v50, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v49, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v48, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v19
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v20.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_add_nc_u32 v19, 0x300, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v3.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3
; GFX11-TRUE16-NEXT: .LBB111_3: ; %end
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index a1c0a87..5d4df4b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -10227,149 +10227,285 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v18f32_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s6, s28, 16
-; GFX11-NEXT: s_lshr_b32 s7, s27, 16
-; GFX11-NEXT: s_lshr_b32 s8, s26, 16
-; GFX11-NEXT: s_lshr_b32 s9, s25, 16
-; GFX11-NEXT: s_lshr_b32 s10, s24, 16
-; GFX11-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-NEXT: s_lshr_b32 s12, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s14, s20, 16
-; GFX11-NEXT: s_lshr_b32 s15, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s17, 16
-; GFX11-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_4
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_add_f32_e64 v13, s29, 1.0
-; GFX11-NEXT: v_add_f32_e64 v14, s28, 1.0
-; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0
-; GFX11-NEXT: v_add_f32_e64 v16, s26, 1.0
-; GFX11-NEXT: v_add_f32_e64 v17, s25, 1.0
-; GFX11-NEXT: v_add_f32_e64 v8, s24, 1.0
-; GFX11-NEXT: v_add_f32_e64 v9, s23, 1.0
-; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0
-; GFX11-NEXT: v_add_f32_e64 v11, s21, 1.0
-; GFX11-NEXT: v_add_f32_e64 v12, s20, 1.0
-; GFX11-NEXT: v_add_f32_e64 v3, s19, 1.0
-; GFX11-NEXT: v_add_f32_e64 v4, s18, 1.0
-; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0
-; GFX11-NEXT: v_add_f32_e64 v6, s16, 1.0
-; GFX11-NEXT: v_add_f32_e64 v7, s3, 1.0
-; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v2, s0, 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: s_branch .LBB29_5
-; GFX11-NEXT: .LBB29_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB29_2
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
-; GFX11-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
-; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
-; GFX11-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
-; GFX11-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
-; GFX11-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
-; GFX11-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
-; GFX11-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
-; GFX11-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
-; GFX11-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
-; GFX11-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
-; GFX11-NEXT: .LBB29_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_4
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s29, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s28, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s26, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s25, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s24, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s23, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s21, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s20, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s19, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s18, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s16, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s3, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s2, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB29_5
+; GFX11-TRUE16-NEXT: .LBB29_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-TRUE16-NEXT: .LBB29_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_4
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: s_branch .LBB29_5
+; GFX11-FAKE16-NEXT: .LBB29_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-FAKE16-NEXT: .LBB29_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12999,149 +13135,285 @@ define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v18f32_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s6, s28, 16
-; GFX11-NEXT: s_lshr_b32 s7, s27, 16
-; GFX11-NEXT: s_lshr_b32 s8, s26, 16
-; GFX11-NEXT: s_lshr_b32 s9, s25, 16
-; GFX11-NEXT: s_lshr_b32 s10, s24, 16
-; GFX11-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-NEXT: s_lshr_b32 s12, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s14, s20, 16
-; GFX11-NEXT: s_lshr_b32 s15, s19, 16
-; GFX11-NEXT: s_lshr_b32 s40, s18, 16
-; GFX11-NEXT: s_lshr_b32 s41, s17, 16
-; GFX11-NEXT: s_lshr_b32 s42, s16, 16
-; GFX11-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_4
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_add_f32_e64 v13, s29, 1.0
-; GFX11-NEXT: v_add_f32_e64 v14, s28, 1.0
-; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0
-; GFX11-NEXT: v_add_f32_e64 v16, s26, 1.0
-; GFX11-NEXT: v_add_f32_e64 v17, s25, 1.0
-; GFX11-NEXT: v_add_f32_e64 v8, s24, 1.0
-; GFX11-NEXT: v_add_f32_e64 v9, s23, 1.0
-; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0
-; GFX11-NEXT: v_add_f32_e64 v11, s21, 1.0
-; GFX11-NEXT: v_add_f32_e64 v12, s20, 1.0
-; GFX11-NEXT: v_add_f32_e64 v3, s19, 1.0
-; GFX11-NEXT: v_add_f32_e64 v4, s18, 1.0
-; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0
-; GFX11-NEXT: v_add_f32_e64 v6, s16, 1.0
-; GFX11-NEXT: v_add_f32_e64 v7, s3, 1.0
-; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v2, s0, 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: s_branch .LBB33_5
-; GFX11-NEXT: .LBB33_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB33_2
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
-; GFX11-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
-; GFX11-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
-; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
-; GFX11-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
-; GFX11-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
-; GFX11-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
-; GFX11-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
-; GFX11-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
-; GFX11-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
-; GFX11-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
-; GFX11-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
-; GFX11-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
-; GFX11-NEXT: .LBB33_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s29, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s28, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s26, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s25, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s24, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s23, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s21, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s20, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s19, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s18, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s16, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s3, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s2, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB33_5
+; GFX11-TRUE16-NEXT: .LBB33_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-TRUE16-NEXT: .LBB33_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: s_branch .LBB33_5
+; GFX11-FAKE16-NEXT: .LBB33_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5
+; GFX11-FAKE16-NEXT: .LBB33_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21895,140 +22167,270 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v9f64_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s14, s28, 16
-; GFX11-NEXT: s_lshr_b32 s6, s27, 16
-; GFX11-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-NEXT: s_lshr_b32 s7, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s8, s23, 16
-; GFX11-NEXT: s_lshr_b32 s41, s22, 16
-; GFX11-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-NEXT: s_lshr_b32 s11, s17, 16
-; GFX11-NEXT: s_lshr_b32 s44, s16, 16
-; GFX11-NEXT: s_lshr_b32 s12, s3, 16
-; GFX11-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-NEXT: s_lshr_b32 s13, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_4
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v0
-; GFX11-NEXT: s_branch .LBB49_5
-; GFX11-NEXT: .LBB49_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB49_2
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
-; GFX11-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
-; GFX11-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
-; GFX11-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
-; GFX11-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
-; GFX11-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
-; GFX11-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
-; GFX11-NEXT: .LBB49_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v33, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v32, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v26, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v1, v35, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v31, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v9, v29, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v22, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v24, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v16, v21, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v17, v23, 16, v19
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], s[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], s[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB49_5
+; GFX11-TRUE16-NEXT: .LBB49_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v35, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s45 :: v_dual_mov_b32 v33, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v31, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v27, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s41 :: v_dual_mov_b32 v25, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s40 :: v_dual_mov_b32 v23, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s15 :: v_dual_mov_b32 v21, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v19, s5
+; GFX11-TRUE16-NEXT: .LBB49_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v32 :: v_dual_mov_b32 v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v34 :: v_dual_mov_b32 v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v32.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v33 :: v_dual_mov_b32 v27, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v30 :: v_dual_mov_b32 v29, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v28 :: v_dual_mov_b32 v25, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v26 :: v_dual_mov_b32 v21, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v24 :: v_dual_mov_b32 v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v23
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v19.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-FAKE16-NEXT: s_branch .LBB49_5
+; GFX11-FAKE16-NEXT: .LBB49_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
+; GFX11-FAKE16-NEXT: .LBB49_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24595,140 +24997,270 @@ define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr22
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v9f64_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s5, s29, 16
-; GFX11-NEXT: s_lshr_b32 s14, s28, 16
-; GFX11-NEXT: s_lshr_b32 s6, s27, 16
-; GFX11-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-NEXT: s_lshr_b32 s7, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s8, s23, 16
-; GFX11-NEXT: s_lshr_b32 s41, s22, 16
-; GFX11-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-NEXT: s_lshr_b32 s42, s20, 16
-; GFX11-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-NEXT: s_lshr_b32 s43, s18, 16
-; GFX11-NEXT: s_lshr_b32 s11, s17, 16
-; GFX11-NEXT: s_lshr_b32 s44, s16, 16
-; GFX11-NEXT: s_lshr_b32 s12, s3, 16
-; GFX11-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-NEXT: s_lshr_b32 s13, s1, 16
-; GFX11-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_4
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v0
-; GFX11-NEXT: s_branch .LBB53_5
-; GFX11-NEXT: .LBB53_3:
-; GFX11-NEXT: ; implicit-def: $sgpr46
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr45
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr44
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr43
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr41
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr15
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: s_branch .LBB53_2
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
-; GFX11-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
-; GFX11-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
-; GFX11-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
-; GFX11-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
-; GFX11-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
-; GFX11-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
-; GFX11-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
-; GFX11-NEXT: .LBB53_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v3, v33, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v32, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v26, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v1, v35, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v7, v31, 16, v34
-; GFX11-NEXT: v_lshl_or_b32 v9, v29, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v22, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v24, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v16, v21, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v17, v23, 16, v19
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], s[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], s[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-TRUE16-NEXT: s_branch .LBB53_5
+; GFX11-TRUE16-NEXT: .LBB53_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v35, s13
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s45 :: v_dual_mov_b32 v33, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v31, s11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v27, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s41 :: v_dual_mov_b32 v25, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s40 :: v_dual_mov_b32 v23, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s15 :: v_dual_mov_b32 v21, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v19, s5
+; GFX11-TRUE16-NEXT: .LBB53_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v32 :: v_dual_mov_b32 v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v34 :: v_dual_mov_b32 v35, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v32.l
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v33 :: v_dual_mov_b32 v27, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v30 :: v_dual_mov_b32 v29, v29
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v28 :: v_dual_mov_b32 v25, v25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v26 :: v_dual_mov_b32 v21, v21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v24 :: v_dual_mov_b32 v19, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v22, v23
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v19.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX11-FAKE16-NEXT: s_branch .LBB53_5
+; GFX11-FAKE16-NEXT: .LBB53_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5
+; GFX11-FAKE16-NEXT: .LBB53_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -27654,149 +28186,285 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, v19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v36f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-NEXT: s_lshr_b32 s41, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-NEXT: s_lshr_b32 s14, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s12, s20, 16
-; GFX11-NEXT: s_lshr_b32 s11, s19, 16
-; GFX11-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_mov_b32 s46, 0
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
-; GFX11-NEXT: s_cbranch_vccnz .LBB57_4
-; GFX11-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s22, s14
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s12, s20, s12
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s19, s11
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: s_branch .LBB57_5
-; GFX11-NEXT: .LBB57_3:
-; GFX11-NEXT: s_branch .LBB57_2
-; GFX11-NEXT: .LBB57_4:
-; GFX11-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
-; GFX11-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
-; GFX11-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
-; GFX11-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
-; GFX11-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
-; GFX11-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
-; GFX11-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
-; GFX11-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
-; GFX11-NEXT: .LBB57_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
+; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: s_branch .LBB57_5
+; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB57_2
+; GFX11-TRUE16-NEXT: .LBB57_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4
+; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: s_branch .LBB57_5
+; GFX11-FAKE16-NEXT: .LBB57_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB57_2
+; GFX11-FAKE16-NEXT: .LBB57_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-FAKE16-NEXT: .LBB57_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -29137,149 +29805,285 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, v19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v36i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-NEXT: s_lshr_b32 s44, s28, 16
-; GFX11-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-NEXT: s_lshr_b32 s42, s26, 16
-; GFX11-NEXT: s_lshr_b32 s41, s25, 16
-; GFX11-NEXT: s_lshr_b32 s40, s24, 16
-; GFX11-NEXT: s_lshr_b32 s15, s23, 16
-; GFX11-NEXT: s_lshr_b32 s14, s22, 16
-; GFX11-NEXT: s_lshr_b32 s13, s21, 16
-; GFX11-NEXT: s_lshr_b32 s12, s20, 16
-; GFX11-NEXT: s_lshr_b32 s11, s19, 16
-; GFX11-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_mov_b32 s46, 0
-; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
-; GFX11-NEXT: s_cbranch_vccnz .LBB59_4
-; GFX11-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-NEXT: s_pack_ll_b32_b16 s28, s28, s44
-; GFX11-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s26, s26, s42
-; GFX11-NEXT: s_pack_ll_b32_b16 s25, s25, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s24, s24, s40
-; GFX11-NEXT: s_pack_ll_b32_b16 s15, s23, s15
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s22, s14
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s21, s13
-; GFX11-NEXT: s_pack_ll_b32_b16 s12, s20, s12
-; GFX11-NEXT: s_pack_ll_b32_b16 s11, s19, s11
-; GFX11-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX11-NEXT: s_branch .LBB59_5
-; GFX11-NEXT: .LBB59_3:
-; GFX11-NEXT: s_branch .LBB59_2
-; GFX11-NEXT: .LBB59_4:
-; GFX11-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
-; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
-; GFX11-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
-; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
-; GFX11-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
-; GFX11-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
-; GFX11-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
-; GFX11-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
-; GFX11-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
-; GFX11-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
-; GFX11-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
-; GFX11-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
-; GFX11-NEXT: .LBB59_5: ; %end
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v0, v35, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v1, v34, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v2, v33, 16, v36
-; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v32, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v4, v31, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v6, v29, 16, v33
-; GFX11-NEXT: v_lshl_or_b32 v7, v28, 16, v34
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v8, v27, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v9, v26, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v24, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v12, v23, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v5, v30, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v10, v25, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v13, v22, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v14, v21, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v20, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v19, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v24
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
+; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-TRUE16-NEXT: s_branch .LBB59_5
+; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB59_2
+; GFX11-TRUE16-NEXT: .LBB59_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v19 :: v_dual_mov_b32 v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v18.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4
+; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-FAKE16-NEXT: s_branch .LBB59_5
+; GFX11-FAKE16-NEXT: .LBB59_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB59_2
+; GFX11-FAKE16-NEXT: .LBB59_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5
+; GFX11-FAKE16-NEXT: .LBB59_5: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 47cb6bd..44cfd6c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -4913,93 +4913,270 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
@@ -8342,93 +8519,270 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
@@ -11100,142 +11454,271 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v20f32_to_v40i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
-; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v5, v36, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v9, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v3, v38, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v37, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12629,93 +13112,270 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
@@ -14269,142 +14929,271 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v20f32_to_v40f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
-; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
-; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v5, v36, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v6, v35, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v9, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v3, v38, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v37, 16, v48
-; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16043,93 +16832,270 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
@@ -19655,93 +20621,270 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
@@ -23094,93 +24237,270 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
@@ -24382,142 +25702,271 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v10f64_to_v40i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v37, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v36, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v33, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v5, v35, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25911,93 +27360,270 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
@@ -27484,142 +29110,271 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr26
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v10f64_to_v40f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v3, v37, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v4, v36, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v34, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v33, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v5, v35, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v11, v30, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v28, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v22, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v21
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v36
-; GFX11-NEXT: v_lshl_or_b32 v10, v31, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v29, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v14, v27, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v26, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v25, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v24, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v23, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, v20
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr2
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -29258,93 +31013,270 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:172
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, v170
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v186 :: v_dual_mov_b32 v19, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xa
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:296
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
@@ -31057,12 +32989,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31083,17 +33013,16 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
@@ -31109,59 +33038,61 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40
@@ -31172,47 +33103,37 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v20 :: v_dual_mov_b32 v1, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
@@ -32879,12 +34800,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32905,17 +34824,16 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v20.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40
@@ -32931,59 +34849,61 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40
@@ -32994,47 +34914,37 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v20 :: v_dual_mov_b32 v1, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v21 :: v_dual_mov_b32 v20, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v20.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index ecc715c..14e17ce 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2411,66 +2411,123 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_i64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
-; GFX11-NEXT: .LBB23_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: s_branch .LBB23_2
-; GFX11-NEXT: .LBB23_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB23_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB23_2
+; GFX11-TRUE16-NEXT: .LBB23_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB23_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB23_2
+; GFX11-FAKE16-NEXT: .LBB23_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3067,9 +3124,9 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3085,52 +3142,47 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5547,66 +5599,123 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_f64_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB47_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB47_4
-; GFX11-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB47_3:
-; GFX11-NEXT: s_branch .LBB47_2
-; GFX11-NEXT: .LBB47_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_f64_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB47_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4
+; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB47_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: .LBB47_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6210,9 +6319,9 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6228,52 +6337,47 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8396,66 +8500,123 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v2i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB67_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB67_4
-; GFX11-NEXT: .LBB67_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB67_3:
-; GFX11-NEXT: s_branch .LBB67_2
-; GFX11-NEXT: .LBB67_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB67_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB67_2
+; GFX11-TRUE16-NEXT: .LBB67_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4
+; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB67_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB67_2
+; GFX11-FAKE16-NEXT: .LBB67_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9050,9 +9211,9 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -9068,52 +9229,47 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10920,66 +11076,123 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v2f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB83_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB83_4
-; GFX11-NEXT: .LBB83_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB83_3:
-; GFX11-NEXT: s_branch .LBB83_2
-; GFX11-NEXT: .LBB83_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v5 :: v_dual_add_nc_u32 v9, v9, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB83_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB83_2
+; GFX11-TRUE16-NEXT: .LBB83_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4
+; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB83_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB83_2
+; GFX11-FAKE16-NEXT: .LBB83_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11590,9 +11803,9 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -11608,52 +11821,47 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -12834,47 +13042,40 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v5 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v0, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -13091,60 +13292,112 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v4i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB95_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB95_4
-; GFX11-NEXT: .LBB95_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB95_3:
-; GFX11-NEXT: s_branch .LBB95_2
-; GFX11-NEXT: .LBB95_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB95_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB95_2
+; GFX11-TRUE16-NEXT: .LBB95_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4
+; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB95_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB95_2
+; GFX11-FAKE16-NEXT: .LBB95_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13809,9 +14062,9 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -13827,52 +14080,47 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -14914,65 +15162,124 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v4f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s2, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB103_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX11-NEXT: s_cbranch_vccnz .LBB103_4
-; GFX11-NEXT: .LBB103_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB103_3:
-; GFX11-NEXT: s_branch .LBB103_2
-; GFX11-NEXT: .LBB103_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB103_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB103_2
+; GFX11-TRUE16-NEXT: .LBB103_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4
+; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB103_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB103_2
+; GFX11-FAKE16-NEXT: .LBB103_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15630,9 +15937,9 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -15648,52 +15955,47 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16644,88 +16946,172 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
; GFX9-NEXT: v_mov_b32_e32 v4, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v4bf16_to_v8i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB109_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
-; GFX11-NEXT: s_lshr_b32 s6, s1, 24
-; GFX11-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11-NEXT: s_lshr_b32 s7, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB109_4
-; GFX11-NEXT: .LBB109_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s1
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v9, v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v10, v6, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v10
-; GFX11-NEXT: v_mov_b32_e32 v4, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB109_3:
-; GFX11-NEXT: ; implicit-def: $sgpr3
-; GFX11-NEXT: ; implicit-def: $sgpr5
-; GFX11-NEXT: ; implicit-def: $sgpr2
-; GFX11-NEXT: ; implicit-def: $sgpr7
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: s_branch .LBB109_2
-; GFX11-NEXT: .LBB109_4:
-; GFX11-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
-; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v7 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v6.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v8
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB109_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: s_branch .LBB109_2
+; GFX11-TRUE16-NEXT: .LBB109_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4
+; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB109_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: s_branch .LBB109_2
+; GFX11-FAKE16-NEXT: .LBB109_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16934,9 +17320,9 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -16952,52 +17338,47 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2cc7c44..87d5157 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -5328,105 +5328,278 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32_scalar:
@@ -9137,105 +9310,278 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32_scalar:
@@ -12099,155 +12445,295 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v22f32_to_v44i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v7, v48, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v5, v50, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v49, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v37, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13805,105 +14291,278 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32_scalar:
@@ -15630,155 +16289,295 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v22f32_to_v44f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
-; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
-; GFX11-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v23, v23, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v7, v48, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v5, v50, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v49, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v37, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v9, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17607,105 +18406,278 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32_scalar:
@@ -21568,105 +22540,278 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64_scalar:
@@ -25389,105 +26534,278 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64_scalar:
@@ -26793,154 +28111,294 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v11f64_to_v44i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v49, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v48, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v39, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v37, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v11, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -28498,105 +29956,278 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64_scalar:
@@ -30248,154 +31879,294 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr30
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v11f64_to_v44f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v5
-; GFX11-NEXT: v_lshl_or_b32 v5, v49, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v6, v48, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v7, v39, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v21, v26, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v38, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v37, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v36, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v11, v11, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v12, v35, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v34, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v14, v33, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v32, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v16, v31, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v30, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v29, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v28, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v20, v27, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-NEXT: v_mov_b32_e32 v2, v22
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -32224,105 +33995,278 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:180
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:52
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v187 :: v_dual_mov_b32 v20, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v21, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xc
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:304
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64_scalar:
@@ -34283,15 +36227,10 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -34313,19 +36252,18 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
@@ -34343,63 +36281,67 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
@@ -34410,53 +36352,40 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16_scalar:
@@ -36279,15 +38208,10 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v21.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -36309,19 +38233,18 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v22.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42
@@ -36339,63 +38262,67 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40
@@ -36406,53 +38333,40 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s5 :: v_dual_mov_b32 v49, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s12
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v23 :: v_dual_mov_b32 v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v22.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index c35e183..fb2e94f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -5805,117 +5805,286 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32_scalar:
@@ -10044,117 +10213,286 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32_scalar:
@@ -13212,166 +13550,317 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v24f32_to_v48i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
-; GFX11-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v51, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v12, v49, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v9, v52, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v48, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v7, v54, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v53, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15153,117 +15642,286 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32_scalar:
@@ -17167,166 +17825,317 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v24f32_to_v48f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
-; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
-; GFX11-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
-; GFX11-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
-; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v51, 16, v12
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v23
-; GFX11-NEXT: v_lshl_or_b32 v12, v49, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v9, v52, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v48, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v7, v54, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v53, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19382,117 +20191,286 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24f32_scalar:
@@ -23764,117 +24742,286 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64_scalar:
@@ -28015,117 +29162,286 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64_scalar:
@@ -29551,166 +30867,317 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v12f64_to_v48i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v53, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v52, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v51, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v10, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v49, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v48, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -31492,117 +32959,286 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64_scalar:
@@ -33424,166 +35060,317 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr34
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v12f64_to_v48f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v24
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v7, v53, 16, v8
-; GFX11-NEXT: v_lshl_or_b32 v8, v52, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v51, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v35, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v10
-; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v33, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v25
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v10, v50, 16, v11
-; GFX11-NEXT: v_lshl_or_b32 v11, v49, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v48, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v14, v39, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v38, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v37, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v36, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v19, v34, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v32, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v22, v31, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v23, v30, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v24
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr6
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -35639,117 +37426,286 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:60
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v20, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v190 :: v_dual_mov_b32 v19, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v187 :: v_dual_mov_b32 v22, v186
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v23, v185
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xe
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:312
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64_scalar:
@@ -37964,19 +39920,11 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -37998,22 +39946,21 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -38032,67 +39979,73 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
@@ -38103,58 +40056,43 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar:
@@ -40168,19 +42106,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v23.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -40202,22 +42132,21 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v24.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -40236,67 +42165,73 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40
@@ -40307,58 +42242,43 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s5 :: v_dual_mov_b32 v53, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s14
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v25 :: v_dual_mov_b32 v24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v24.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 29005a4..07cdbef 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -6286,129 +6286,295 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32_scalar:
@@ -10946,129 +11112,295 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32_scalar:
@@ -14389,178 +14721,340 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v26f32_to_v52i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
-; GFX11-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v11, v64, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v53, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v9, v66, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v65, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16527,129 +17021,295 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32_scalar:
@@ -18769,178 +19429,340 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v26f32_to_v52f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
-; GFX11-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
-; GFX11-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
-; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v27, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v11, v64, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v53, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v9, v66, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v65, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21183,129 +22005,295 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32_scalar:
@@ -25980,129 +26968,295 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64_scalar:
@@ -30655,129 +31809,295 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64_scalar:
@@ -32378,178 +33698,340 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr25
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v13f64_to_v52i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v65, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v64, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v66, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v53, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v11, v55, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v15, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr15
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -34516,129 +35998,295 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64_scalar:
@@ -36667,178 +38315,340 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr25
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v13f64_to_v52f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
-; GFX11-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v8, v8, 16, v9
-; GFX11-NEXT: v_lshl_or_b32 v9, v65, 16, v10
-; GFX11-NEXT: v_lshl_or_b32 v10, v64, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v27, v66, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v54, 16, v13
-; GFX11-NEXT: v_lshl_or_b32 v13, v53, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v52, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v19, v48, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v11, v55, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v15, v15, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v18, v49, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v28, v28, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v37, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v33
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v67, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v50, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v21, v38, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v23, v36, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v24, v35, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v25, v34, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v32
-; GFX11-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr15
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -39081,129 +40891,295 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v186, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v5 :: v_dual_mov_b32 v188, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v191 :: v_dual_mov_b32 v20, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v188
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v187 :: v_dual_mov_b32 v24, v186
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, v119
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v53, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64_scalar:
@@ -41806,23 +43782,12 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -41844,26 +43809,25 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -41882,71 +43846,79 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
@@ -41957,62 +43929,46 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16_scalar:
@@ -44258,23 +46214,12 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v25.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
@@ -44296,26 +46241,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v26.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43
@@ -44334,71 +46278,79 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s40 :: v_dual_mov_b32 v39, s15
@@ -44409,62 +46361,46 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, s5 :: v_dual_mov_b32 v65, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s41
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v27 :: v_dual_mov_b32 v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v26.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8ee5b96..8eb71e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -6779,141 +6779,299 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32_scalar:
@@ -11885,141 +12043,299 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32_scalar:
@@ -15595,191 +15911,364 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v28f32_to_v56i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
-; GFX11-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_mov_b32_e32 v5, v35
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17915,141 +18404,299 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32_scalar:
@@ -20379,191 +21026,364 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v28f32_to_v56f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
-; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
-; GFX11-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
-; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
-; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
-; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
-; GFX11-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_mov_b32_e32 v5, v35
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23006,141 +23826,299 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32_scalar:
@@ -28216,141 +29194,299 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64_scalar:
@@ -33336,141 +34472,299 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64_scalar:
@@ -35225,191 +36519,364 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr27
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v14f64_to_v56i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
-; GFX11-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
-; GFX11-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
-; GFX11-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
-; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; GFX11-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
-; GFX11-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
-; GFX11-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
-; GFX11-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v26
-; GFX11-NEXT: v_mov_b32_e32 v9, v29
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37545,141 +39012,299 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64_scalar:
@@ -39918,191 +41543,364 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr27
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v14f64_to_v56f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
-; GFX11-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
-; GFX11-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
-; GFX11-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
-; GFX11-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
-; GFX11-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
-; GFX11-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
-; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
-; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
-; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
-; GFX11-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
-; GFX11-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
-; GFX11-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
-; GFX11-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v27
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v28, v71, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v25
-; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v16
-; GFX11-NEXT: v_lshl_or_b32 v15, v66, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-NEXT: v_mov_b32_e32 v6, v36
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v26
-; GFX11-NEXT: v_mov_b32_e32 v9, v29
-; GFX11-NEXT: v_lshl_or_b32 v11, v70, 16, v12
-; GFX11-NEXT: v_lshl_or_b32 v12, v69, 16, v17
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v13, v68, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v64, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v21, v52, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v27, v38, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v14, v67, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v16, v65, 16, v20
-; GFX11-NEXT: v_lshl_or_b32 v18, v55, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-NEXT: v_lshl_or_b32 v26, v39, 16, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v23, v50, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v24, v49, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v25, v48, 16, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
-; GFX11-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
-; GFX11-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -42545,141 +44343,299 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v189, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v186 :: v_dual_mov_b32 v20, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v191 :: v_dual_mov_b32 v22, v190
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v189 :: v_dual_mov_b32 v24, v188
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v28
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v28 :: v_dual_mov_b32 v53, v26
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v28, v64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64_scalar:
@@ -45566,27 +47522,13 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -45608,30 +47550,29 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -45650,75 +47591,85 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v34, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
@@ -45729,69 +47680,49 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16_scalar:
@@ -48280,27 +50211,13 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v27.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -48322,30 +50239,29 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v28.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -48364,75 +50280,85 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v34, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, s42 :: v_dual_mov_b32 v49, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s40 :: v_dual_mov_b32 v51, s15
@@ -48443,69 +50369,49 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, s5 :: v_dual_mov_b32 v69, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s43
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v29 :: v_dual_mov_b32 v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v28.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 967f1a9..93c11f1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -7240,153 +7240,305 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB15_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
@@ -12840,153 +12992,305 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB19_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB19_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
@@ -16802,204 +17106,388 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB29_2
;
-; GFX11-LABEL: bitcast_v30f32_to_v60i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
-; GFX11-NEXT: .LBB29_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB29_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB29_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB29_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB29_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB29_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB29_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB29_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB29_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19290,153 +19778,305 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB31_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB31_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
@@ -21985,204 +22625,388 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB33_2
;
-; GFX11-LABEL: bitcast_v30f32_to_v60f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
-; GFX11-NEXT: .LBB33_2: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB33_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v13
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB33_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB33_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB33_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB33_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB33_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB33_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB33_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24867,153 +25691,305 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB35_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB35_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
@@ -30472,153 +31448,305 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB43_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB43_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
@@ -36089,153 +37217,305 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB47_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB47_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
@@ -38144,204 +39424,388 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB49_2
;
-; GFX11-LABEL: bitcast_v15f64_to_v60i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB49_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v31, v82, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v9, v33
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_mov_b32_e32 v8, v32
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB49_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB49_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -40632,153 +42096,305 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB51_2
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
@@ -43227,204 +44843,388 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: s_branch .LBB53_2
;
-; GFX11-LABEL: bitcast_v15f64_to_v60f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
-; GFX11-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
-; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
-; GFX11-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
-; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
-; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
-; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: .LBB53_3: ; %end
-; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT: v_lshl_or_b32 v48, v48, 16, v26
-; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-NEXT: v_lshl_or_b32 v37, v37, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT: v_lshl_or_b32 v39, v39, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT: v_lshl_or_b32 v31, v82, 16, v25
-; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v27
-; GFX11-NEXT: v_lshl_or_b32 v33, v33, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v21
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
-; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-NEXT: v_lshl_or_b32 v21, v66, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
-; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
-; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-NEXT: v_mov_b32_e32 v7, v31
-; GFX11-NEXT: v_lshl_or_b32 v49, v49, 16, v26
-; GFX11-NEXT: v_lshl_or_b32 v26, v53, 16, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-NEXT: v_lshl_or_b32 v36, v36, 16, v30
-; GFX11-NEXT: v_mov_b32_e32 v9, v33
-; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
-; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v14
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
-; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v17
-; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
-; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v29, v50, 16, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
-; GFX11-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
-; GFX11-NEXT: v_mov_b32_e32 v6, v30
-; GFX11-NEXT: v_mov_b32_e32 v8, v32
-; GFX11-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr12
-; GFX11-NEXT: ; implicit-def: $vgpr13
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: .LBB53_3: ; %end
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GFX11-FAKE16-NEXT: .LBB53_3: ; %end
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -46109,153 +47909,305 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:192
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:64
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v187, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v189, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v190, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v191, s32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v9 :: v_dual_mov_b32 v25, v7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v191, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: .LBB55_3: ; %end
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v104
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v152
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v170 :: v_dual_mov_b32 v18, v189
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v188 :: v_dual_mov_b32 v20, v187
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v186 :: v_dual_mov_b32 v22, v185
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v191 :: v_dual_mov_b32 v24, v190
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v191, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v190, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v189, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_b32 v188, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v187, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v186, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v185, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:252
+; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:316
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v119 :: v_dual_mov_b32 v27, v30
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v29 :: v_dual_mov_b32 v65, v28
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v66, v30
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v26 :: v_dual_mov_b32 v54, v25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v54 :: v_dual_mov_b32 v26, v53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v65 :: v_dual_mov_b32 v29, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184
; GFX11-TRUE16-NEXT: s_branch .LBB55_2
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
@@ -49421,31 +51373,14 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -49467,34 +51402,33 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -49513,79 +51447,91 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v34, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v49, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
; GFX11-TRUE16-NEXT: .LBB57_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
@@ -49596,75 +51542,52 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16_scalar:
@@ -52368,31 +54291,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v29.h
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
@@ -52414,34 +54320,33 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v30.h
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s43
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42
@@ -52460,79 +54365,91 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v34, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v49, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
; GFX11-TRUE16-NEXT: .LBB59_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30
; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v16, s28
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s26
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v12, s24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s16
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, s44 :: v_dual_mov_b32 v51, s43
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s42 :: v_dual_mov_b32 v53, s41
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s40 :: v_dual_mov_b32 v55, s15
@@ -52543,75 +54460,52 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, s5 :: v_dual_mov_b32 v81, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s45
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v38
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v83 :: v_dual_mov_b32 v82, v82
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v81, v81 :: v_dual_mov_b32 v80, v80
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v71 :: v_dual_mov_b32 v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v69 :: v_dual_mov_b32 v68, v68
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v67 :: v_dual_mov_b32 v66, v66
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v65, v65 :: v_dual_mov_b32 v64, v64
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v55 :: v_dual_mov_b32 v54, v54
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v53 :: v_dual_mov_b32 v52, v52
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v51 :: v_dual_mov_b32 v50, v50
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v49 :: v_dual_mov_b32 v48, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v39 :: v_dual_mov_b32 v38, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v37 :: v_dual_mov_b32 v36, v36
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v35 :: v_dual_mov_b32 v34, v34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v33 :: v_dual_mov_b32 v32, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v31 :: v_dual_mov_b32 v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v69.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v30.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 685e2fb..6ada0cb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -1104,16 +1104,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -1128,37 +1127,28 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true
@@ -1166,36 +1156,26 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2422,89 +2402,171 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v3i32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB11_4
-; GFX11-NEXT: .LBB11_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB11_3:
-; GFX11-NEXT: s_branch .LBB11_2
-; GFX11-NEXT: .LBB11_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3i32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB11_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX11-TRUE16-NEXT: .LBB11_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB11_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB11_2
+; GFX11-TRUE16-NEXT: .LBB11_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3i32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB11_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX11-FAKE16-NEXT: .LBB11_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB11_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB11_2
+; GFX11-FAKE16-NEXT: .LBB11_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4254,16 +4316,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -4278,37 +4339,28 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -4316,36 +4368,26 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5576,89 +5618,171 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v3f32_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB27_4
-; GFX11-NEXT: .LBB27_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB27_3:
-; GFX11-NEXT: s_branch .LBB27_2
-; GFX11-NEXT: .LBB27_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3f32_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB27_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB27_2
+; GFX11-TRUE16-NEXT: .LBB27_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3f32_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB27_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB27_2
+; GFX11-FAKE16-NEXT: .LBB27_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6909,12 +7033,12 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -6929,37 +7053,28 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2
; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true
@@ -6967,36 +7082,26 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8288,124 +8393,243 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v4, s17
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v12i8_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s13, s2, 16
-; GFX11-NEXT: s_lshr_b32 s12, s2, 8
-; GFX11-NEXT: s_lshr_b32 s8, s1, 24
-; GFX11-NEXT: s_lshr_b32 s14, s1, 16
-; GFX11-NEXT: s_lshr_b32 s9, s1, 8
-; GFX11-NEXT: s_lshr_b32 s11, s0, 16
-; GFX11-NEXT: s_lshr_b32 s10, s0, 8
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
-; GFX11-NEXT: s_cbranch_vccnz .LBB39_4
-; GFX11-NEXT: .LBB39_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s1, 0, s2
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v0, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v2, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
-; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v13
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v11
-; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v13
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB39_3:
-; GFX11-NEXT: ; implicit-def: $sgpr10
-; GFX11-NEXT: ; implicit-def: $sgpr11
-; GFX11-NEXT: ; implicit-def: $sgpr4
-; GFX11-NEXT: ; implicit-def: $sgpr9
-; GFX11-NEXT: ; implicit-def: $sgpr14
-; GFX11-NEXT: ; implicit-def: $sgpr8
-; GFX11-NEXT: ; implicit-def: $sgpr12
-; GFX11-NEXT: ; implicit-def: $sgpr13
-; GFX11-NEXT: ; implicit-def: $sgpr6
-; GFX11-NEXT: s_branch .LBB39_2
-; GFX11-NEXT: .LBB39_4:
-; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
-; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
-; GFX11-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
-; GFX11-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 24
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 8
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s2
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v13
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB39_3:
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-TRUE16-NEXT: s_branch .LBB39_2
+; GFX11-TRUE16-NEXT: .LBB39_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 24
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s0, 8
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v13
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v7, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v13
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB39_3:
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX11-FAKE16-NEXT: s_branch .LBB39_2
+; GFX11-FAKE16-NEXT: .LBB39_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8669,12 +8893,12 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -8689,37 +8913,28 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2
; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true
@@ -8727,36 +8942,26 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10079,12 +10284,12 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -10099,37 +10304,28 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2
; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true
@@ -10137,36 +10333,26 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11809,89 +11995,169 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v6f16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB49_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_4
-; GFX11-NEXT: .LBB49_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2
-; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v6
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB49_3:
-; GFX11-NEXT: s_branch .LBB49_2
-; GFX11-NEXT: .LBB49_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6f16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB49_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB49_2
+; GFX11-TRUE16-NEXT: .LBB49_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6f16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4
+; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v6
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v4, 16, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB49_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB49_2
+; GFX11-FAKE16-NEXT: .LBB49_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12403,64 +12669,57 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v10 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v0, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v8, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v8, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v7, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
; GFX11-TRUE16-NEXT: .LBB52_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -12748,80 +13007,151 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v6bf16_to_v6i16_scalar:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB53_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_4
-; GFX11-NEXT: .LBB53_2: ; %cmp.true
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
-; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
-; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
-; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB53_3:
-; GFX11-NEXT: s_branch .LBB53_2
-; GFX11-NEXT: .LBB53_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6i16_scalar:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v9 :: v_dual_add_nc_u32 v9, v10, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT: .LBB53_3:
+; GFX11-TRUE16-NEXT: s_branch .LBB53_2
+; GFX11-TRUE16-NEXT: .LBB53_4:
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6i16_scalar:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4
+; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v10, v6
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT: .LBB53_3:
+; GFX11-FAKE16-NEXT: s_branch .LBB53_2
+; GFX11-FAKE16-NEXT: .LBB53_4:
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e71bf15..e34aaf20 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -136,7 +136,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
@@ -218,7 +218,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index e27164c..948811e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s8, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s8
-; GFX6-NEXT: s_xor_b32 s3, s3, s8
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT: s_sub_i32 s4, 0, s3
-; GFX6-NEXT: s_ashr_i32 s9, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s2, s2, s9
+; GFX6-NEXT: s_abs_i32 s8, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT: s_sub_i32 s4, 0, s8
+; GFX6-NEXT: s_abs_i32 s9, s2
; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s3
-; GFX6-NEXT: s_sub_i32 s0, s2, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_mul_i32 s0, s0, s8
+; GFX6-NEXT: s_sub_i32 s0, s9, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s8
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT: s_xor_b32 s0, s2, s3
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT: s_xor_b32 s0, s9, s8
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
-; GFX9-NEXT: s_ashr_i32 s5, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s5
+; GFX9-NEXT: s_abs_i32 s4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s4
+; GFX9-NEXT: s_abs_i32 s5, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s6, s6, s7
; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT: s_mul_i32 s8, s6, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT: s_mul_i32 s8, s6, s4
+; GFX9-NEXT: s_sub_i32 s5, s5, s8
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_sub_i32 s8, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_sub_i32 s8, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s6, s7, s6
-; GFX9-NEXT: s_cselect_b32 s2, s8, s2
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s7, s6
-; GFX9-NEXT: s_xor_b32 s3, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, s7, s6
; GFX9-NEXT: s_xor_b32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s4, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-LABEL: srem_i32_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: s_xor_b32 s4, s3, s4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT: s_sub_i32 s3, 0, s4
-; GFX6-NEXT: s_ashr_i32 s5, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s5
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT: s_sub_i32 s4, 0, s3
+; GFX6-NEXT: s_abs_i32 s8, s2
+; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s6, s2, s5
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_mul_i32 s7, s7, s4
-; GFX6-NEXT: s_sub_i32 s6, s6, s7
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s6, s7, s6
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s4, s7, s6
-; GFX6-NEXT: s_xor_b32 s4, s4, s5
-; GFX6-NEXT: s_sub_i32 s4, s4, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s0, s0, s3
+; GFX6-NEXT: s_sub_i32 s0, s8, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_ashr_i32 s1, s2, 31
+; GFX6-NEXT: s_xor_b32 s0, s0, s1
+; GFX6-NEXT: s_sub_i32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
@@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
+; GFX9-NEXT: s_abs_i32 s3, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s5, 0, s3
-; GFX9-NEXT: s_ashr_i32 s4, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s4
+; GFX9-NEXT: s_abs_i32 s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s5, s5, s6
; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6
; GFX9-NEXT: s_mul_i32 s5, s5, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s3, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 861621b..c1b8bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -410,26 +410,14 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) {
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4i16:
-; GFX11-TRUE16-SDAG: ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-SDAG-NEXT: ; use v[0:1]
-; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND
-; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-GISEL-LABEL: undef_lo2_v4i16:
-; GFX11-TRUE16-GISEL: ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1]
-; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND
-; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: undef_lo2_v4i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v[0:1]
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
%undef.lo = shufflevector <2 x i16> %arg0, <2 x i16> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 3cf70c4..d7d697e 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -576,11 +576,11 @@ define i32 @sdiv32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v2, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v5, v0, v5
+; GFX9-NEXT: v_max_i32_e32 v5, v5, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
@@ -640,11 +640,11 @@ define i32 @srem32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v1, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v4, v0, v4
+; GFX9-NEXT: v_max_i32_e32 v4, v4, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index cbf6b66..7dbbeaa 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -3632,13 +3632,9 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -3813,16 +3809,12 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec
; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l
-; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
-; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v1, off
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 26f204f..14897b6 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1771,33 +1771,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v5.h, 9
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v5.h
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x900, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x900, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 9c59b42..ab96dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -563,10 +563,9 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
; GFX11-TRUE16-LABEL: divergent_vec_i16_HH:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: divergent_vec_i16_HH:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 043bcc3..f64615d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -264,3 +264,90 @@ body: |
$sgpr0 = COPY %16:sreg_32
SI_RETURN_TO_EPILOG $sgpr0
...
+
+---
+name: s_pack_ll_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF1]].lo16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_LL_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_lh_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_lh_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF1]].hi16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_LH_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_hl_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_hl_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].hi16, %subreg.lo16, [[DEF1]].lo16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_HL_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_hh_b32_b16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_hh_b32_b16
+ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[DEF]], implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].hi16, %subreg.lo16, [[DEF1]].hi16, %subreg.hi16
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_PACK_HH_B32_B16 %0:sreg_32, %2:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_ll_b32_b16_use_SALU16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16_use_SALU16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[DEF]].lo16, 0, [[DEF]].lo16, 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FMAC_F16_t16_e64_]], %subreg.lo16, [[DEF]].lo16, %subreg.hi16
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_FMAC_F16 %1:sreg_32, %1:sreg_32, %1:sreg_32, implicit $mode
+ %3:sreg_32 = S_PACK_LL_B32_B16 %2:sreg_32, %1:sreg_32, implicit-def dead $scc
+...
+
+---
+name: s_pack_ll_b32_b16_use_imm
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_pack_ll_b32_b16_use_imm
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B32_e32_]].lo16, %subreg.lo16, [[DEF]].lo16, %subreg.hi16
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_PACK_LL_B32_B16 1, %1:sreg_32, implicit-def dead $scc
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9e15225..3145a27 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -10,7 +10,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -453,7 +453,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -569,7 +569,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
ret void
}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -740,7 +740,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -955,14 +955,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
%outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
%a = load float, ptr addrspace(1) %gep0
- %max = call float @llvm.maxnum.f32(float %a, float 2.0)
- %med = call float @llvm.minnum.f32(float %max, float 4.0)
+ %max = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call nnan float @llvm.minnum.f32(float %max, float 4.0)
store float %med, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -1297,10 +1297,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1487,10 +1487,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%b.fneg = fsub float -0.0, %b
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b.fneg)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b.fneg)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1677,10 +1677,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%c.fneg = fsub float -0.0, %c
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -1872,14 +1872,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %b.fabs = call float @llvm.fabs.f32(float %b)
- %c.fabs = call float @llvm.fabs.f32(float %c)
+ %b.fabs = call nnan float @llvm.fabs.f32(float %b)
+ %c.fabs = call nnan float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
@@ -2082,16 +2082,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
%c.fabs = call float @llvm.fabs.f32(float %c)
%c.fabs.fneg = fsub float -0.0, %c.fabs
- %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
- %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2418,7 +2418,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
ret void
}
-define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2570,7 +2570,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -2878,10 +2878,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3030,10 +3030,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3220,10 +3220,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3372,10 +3372,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3524,10 +3524,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3676,10 +3676,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3828,10 +3828,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -3980,10 +3980,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4132,10 +4132,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4284,10 +4284,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4436,10 +4436,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4588,10 +4588,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4740,10 +4740,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -4892,10 +4892,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5044,10 +5044,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5196,10 +5196,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5348,10 +5348,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
- %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
- %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
- %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %b, float %a)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %b, float %a)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %c, float %tmp1)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp2, float %tmp0)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5503,10 +5503,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %tmp0 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp1 = call float @llvm.minnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp1 = call nnan float @llvm.minnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.maxnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.minnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -5515,7 +5515,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; Negative patterns
; ---------------------------------------------------------------------
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5717,7 +5717,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -5944,7 +5944,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6146,7 +6146,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6352,7 +6352,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6527,7 +6527,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6702,7 +6702,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -6877,7 +6877,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
ret void
}
-define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7270,10 +7270,10 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
%a.fneg = fsub float -0.0, %a
- %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
- %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
- %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
+ %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %med3, ptr addrspace(1) %outgep
ret void
}
@@ -7428,13 +7428,13 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
%a = load volatile float, ptr addrspace(1) %gep0
%b = load volatile float, ptr addrspace(1) %gep1
%c = load volatile float, ptr addrspace(1) %gep2
- %max = call float @llvm.maxnum.f32(float %a, float %b)
- %minmax = call float @llvm.minnum.f32(float %max, float %c)
+ %max = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ %minmax = call nnan float @llvm.minnum.f32(float %max, float %c)
store float %minmax, ptr addrspace(1) %outgep
ret void
}
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7597,7 +7597,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
ret void
}
-define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
@@ -7865,7 +7865,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
ret void
}
-define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: two_non_inline_constant:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -7998,7 +7998,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
}
; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
-define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: one_non_inline_constant:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8137,7 +8137,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
ret void
}
-define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
; SI-SDAG-LABEL: two_non_inline_constant_multi_use:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -8343,7 +8343,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
ret void
}
-define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
; SI-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8384,7 +8384,7 @@ define float @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
ret float %med
}
-define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) #1 {
+define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8452,7 +8452,7 @@ define <2 x float> @v_test_fmed3_r_i_i_v2f32_minimumnum_maximumnum(<2 x float> %
ret <2 x float> %med
}
-define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) #1 {
+define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(float %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8525,7 +8525,7 @@ define { float, float } @v_test_fmed3_r_i_i_f32_minimumnum_maximumnum_multi_use(
ret { float, float } %ins.1
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8567,7 +8567,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_minimumnum_maximumnum(float %a)
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8609,7 +8609,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minimumnum(float %a)
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8651,7 +8651,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maxnum_minimumnum(float %a) #1 {
ret float %med
}
-define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
+define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) {
; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8693,7 +8693,7 @@ define float @v_test_nnan_input_fmed3_r_i_i_f32_maximumnum_minnum(float %a) #1 {
ret float %med
}
-define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
+define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8772,7 +8772,7 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
ret half %med
}
-define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) #1 {
+define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8848,7 +8848,7 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
ret <2 x half> %med
}
-define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) #1 {
+define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) {
; SI-LABEL: v_test_fmed3_r_i_i_f64_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8905,5 +8905,4 @@ declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 76da0aa..10c60df 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -478,41 +478,76 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s1, s0, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX11-NEXT: v_add_f32_e64 v0, s1, 1.0
-; GFX11-NEXT: v_add_f32_e64 v1, s0, 2.0
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s1, 2.0
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s0, 1.0
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x80008000, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_fabs_v2bf16_non_bc_src:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s1, 1.0
+; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s0, 2.0
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%add = fadd <2 x bfloat> %in, <bfloat 1.0, bfloat 2.0>
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %add)
%fneg.fabs = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
@@ -752,42 +787,78 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fold_user_fneg_fabs_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s1, s0, 0x7fff
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX11-NEXT: v_mul_f32_e64 v0, s1, -4.0
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e64 v1, s0, -4.0
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fold_user_fneg_fabs_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v0, s1, -4.0
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fold_user_fneg_fabs_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0x7fff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v0, s1, -4.0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
%fneg.fabs = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
%mul = fmul <2 x bfloat> %fneg.fabs, <bfloat 4.0, bfloat 4.0>
@@ -975,46 +1046,88 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s0, s6, 0x7fff
-; GFX11-NEXT: s_lshr_b32 s1, s6, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
-; GFX11-NEXT: s_and_b32 s1, s1, 0x7fff
-; GFX11-NEXT: v_mul_f32_e64 v0, s0, -4.0
-; GFX11-NEXT: s_lshl_b32 s0, s1, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e64 v1, s0, -4.0
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v2, v3, s[0:1]
-; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s6, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s6, 0x7fff
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v0, s0, -4.0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s6, 0x7fff7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v3, s[0:1]
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0x7fff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s6, 16
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0x7fff
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v0, s0, -4.0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_mul_f32_e64 v1, s0, -4.0
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s6, 0x7fff7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v3, s[0:1]
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
%fneg = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %fabs
%mul = fmul <2 x bfloat> %fneg, <bfloat 4.0, bfloat 4.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index 98044a7..84b904f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -712,47 +712,88 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fneg_fold_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x8000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v3, v3, v4 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fneg_fold_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_f32 v1, v3, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fneg_fold_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_f32 v3, v3, v4 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x bfloat>, ptr addrspace(1) %in
%fsub = fsub <2 x bfloat> <bfloat -0.0, bfloat -0.0>, %val
%fmul = fmul <2 x bfloat> %fsub, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index f048dc5..a43292d 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -330,11 +330,8 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 96abb3a..96cb621 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -329,11 +329,8 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 78a961e..35d178c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -4858,7 +4858,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
; SI-NEXT: s_cbranch_vccz .LBB9_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else20
; SI-NEXT: v_bfi_b32 v7, s0, 0, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6
@@ -4869,7 +4869,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB9_2:
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB9_3: ; %frem.compute
+; SI-NEXT: .LBB9_3: ; %frem.compute19
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5
@@ -4905,10 +4905,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB9_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB9_5: ; %frem.loop_body
+; SI-NEXT: .LBB9_5: ; %frem.loop_body27
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -4923,7 +4923,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB9_5
; SI-NEXT: ; %bb.6: ; %Flow55
; SI-NEXT: v_mov_b32_e32 v5, v7
-; SI-NEXT: .LBB9_7: ; %frem.loop_exit
+; SI-NEXT: .LBB9_7: ; %frem.loop_exit28
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1
; SI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -4944,7 +4944,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7
; SI-NEXT: s_cbranch_vccz .LBB9_10
-; SI-NEXT: ; %bb.9: ; %frem.else20
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v8, s0, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -4956,7 +4956,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB9_10:
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB9_11: ; %frem.compute19
+; SI-NEXT: .LBB9_11: ; %frem.compute
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v6
@@ -4992,10 +4992,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB9_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB9_13: ; %frem.loop_body27
+; SI-NEXT: .LBB9_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v8, v6
; SI-NEXT: v_mul_f32_e32 v6, v8, v7
@@ -5010,7 +5010,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB9_13
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; SI-NEXT: .LBB9_15: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1
; SI-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -5084,7 +5084,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB9_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else20
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_bfi_b32 v7, s0, 0, v2
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -5093,7 +5093,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_8
; CI-NEXT: .LBB9_2:
; CI-NEXT: ; implicit-def: $vgpr4
-; CI-NEXT: .LBB9_3: ; %frem.compute
+; CI-NEXT: .LBB9_3: ; %frem.compute19
; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; CI-NEXT: v_frexp_mant_f32_e32 v4, v6
; CI-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -5118,10 +5118,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10
; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6
-; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: .LBB9_5: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -5136,7 +5136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB9_5
; CI-NEXT: ; %bb.6: ; %Flow55
; CI-NEXT: v_mov_b32_e32 v7, v9
-; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6
; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6
; CI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -5157,7 +5157,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6
; CI-NEXT: s_cbranch_vccz .LBB9_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v8, s0, 0, v0
@@ -5167,7 +5167,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_16
; CI-NEXT: .LBB9_10:
; CI-NEXT: ; implicit-def: $vgpr5
-; CI-NEXT: .LBB9_11: ; %frem.compute19
+; CI-NEXT: .LBB9_11: ; %frem.compute
; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7
; CI-NEXT: v_frexp_mant_f32_e32 v5, v7
; CI-NEXT: v_frexp_mant_f32_e32 v7, v6
@@ -5192,10 +5192,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7
; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11
; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7
-; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: .LBB9_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v10, v8
; CI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -5210,7 +5210,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB9_13
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v8, v10
-; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7
; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7
; CI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -5275,7 +5275,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v3, |v1|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; VI-NEXT: s_cbranch_vccz .LBB9_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else20
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v2, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
@@ -5284,7 +5284,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_8
; VI-NEXT: .LBB9_2:
; VI-NEXT: ; implicit-def: $vgpr2
-; VI-NEXT: .LBB9_3: ; %frem.compute
+; VI-NEXT: .LBB9_3: ; %frem.compute19
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
; VI-NEXT: v_frexp_mant_f32_e32 v2, v4
; VI-NEXT: v_frexp_mant_f32_e32 v4, v3
@@ -5309,10 +5309,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4
; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8
; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4
-; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: .LBB9_5: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -5327,7 +5327,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB9_5
; VI-NEXT: ; %bb.6: ; %Flow55
; VI-NEXT: v_mov_b32_e32 v5, v7
-; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
; VI-NEXT: v_ldexp_f32 v4, v5, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v6
@@ -5347,7 +5347,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v6, |v4|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6
; VI-NEXT: s_cbranch_vccz .LBB9_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v5, s2, 0, v3
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6
@@ -5356,7 +5356,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_16
; VI-NEXT: .LBB9_10:
; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: .LBB9_11: ; %frem.compute19
+; VI-NEXT: .LBB9_11: ; %frem.compute
; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7
; VI-NEXT: v_frexp_mant_f32_e32 v5, v7
; VI-NEXT: v_frexp_mant_f32_e32 v7, v6
@@ -5381,10 +5381,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7
; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11
; VI-NEXT: v_add_u32_e32 v7, vcc, 11, v7
-; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: .LBB9_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v10, v8
; VI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -5399,7 +5399,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB9_13
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v8, v10
-; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v7, vcc, -10, v7
; VI-NEXT: v_ldexp_f32 v7, v8, v7
; VI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -5443,7 +5443,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v3, |v0|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; GFX9-NEXT: s_cbranch_vccz .LBB9_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else20
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
@@ -5452,7 +5452,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB9_8
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: ; implicit-def: $vgpr2
-; GFX9-NEXT: .LBB9_3: ; %frem.compute
+; GFX9-NEXT: .LBB9_3: ; %frem.compute19
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v3
@@ -5477,10 +5477,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4
; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8
; GFX9-NEXT: v_add_u32_e32 v4, 11, v4
-; GFX9-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -5495,7 +5495,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB9_5
; GFX9-NEXT: ; %bb.6: ; %Flow55
; GFX9-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX9-NEXT: v_add_u32_e32 v4, -10, v4
; GFX9-NEXT: v_ldexp_f32 v4, v5, v4
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6
@@ -5514,7 +5514,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; GFX9-NEXT: s_cbranch_vccz .LBB9_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else20
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -5523,7 +5523,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB9_16
; GFX9-NEXT: .LBB9_10:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB9_11: ; %frem.compute19
+; GFX9-NEXT: .LBB9_11: ; %frem.compute
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -5548,10 +5548,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 11, v6
-; GFX9-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX9-NEXT: .LBB9_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -5566,7 +5566,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB9_13
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v6, -10, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -5612,7 +5612,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX10-NEXT: s_cbranch_vccz .LBB9_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else20
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo
@@ -5620,7 +5620,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB9_8
; GFX10-NEXT: .LBB9_2:
; GFX10-NEXT: ; implicit-def: $vgpr2
-; GFX10-NEXT: .LBB9_3: ; %frem.compute
+; GFX10-NEXT: .LBB9_3: ; %frem.compute19
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -5647,10 +5647,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v7, v4
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -5666,7 +5666,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow55
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: v_mov_b32_e32 v4, v7
-; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX10-NEXT: v_ldexp_f32 v4, v4, v6
; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -5684,7 +5684,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v3|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v4
; GFX10-NEXT: s_cbranch_vccz .LBB9_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else20
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc_lo
@@ -5692,7 +5692,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB9_16
; GFX10-NEXT: .LBB9_10:
; GFX10-NEXT: ; implicit-def: $vgpr5
-; GFX10-NEXT: .LBB9_11: ; %frem.compute19
+; GFX10-NEXT: .LBB9_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v6
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
; GFX10-NEXT: v_ldexp_f32 v6, v5, 11
@@ -5719,10 +5719,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX10-NEXT: .LBB9_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -5738,7 +5738,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -5782,7 +5782,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
@@ -5793,7 +5793,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB9_8
; GFX11-TRUE16-NEXT: .LBB9_2:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -5829,11 +5829,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -5853,7 +5853,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -5880,7 +5880,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10
-; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
@@ -5891,7 +5891,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB9_16
; GFX11-TRUE16-NEXT: .LBB9_10:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v6
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -5927,11 +5927,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v7
@@ -5951,7 +5951,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, -10, v9
; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, v9
@@ -5972,16 +5972,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: .LBB9_16: ; %Flow54
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v3.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v7.l, s2
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -6002,7 +6000,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_2
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -6011,7 +6009,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB9_8
; GFX11-FAKE16-NEXT: .LBB9_2:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -6047,11 +6045,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -6071,7 +6069,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -6097,7 +6095,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_10
-; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -6106,7 +6104,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB9_16
; GFX11-FAKE16-NEXT: .LBB9_10:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
-; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v7
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
@@ -6142,11 +6140,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v7
@@ -6166,7 +6164,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, -10, v9
; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, v9
@@ -6220,7 +6218,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
@@ -6232,7 +6230,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB9_8
; GFX1150-TRUE16-NEXT: .LBB9_2:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6267,11 +6265,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s5, s6, s5
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, 11
-; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6293,7 +6291,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6323,7 +6321,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7
@@ -6335,7 +6333,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB9_16
; GFX1150-TRUE16-NEXT: .LBB9_10:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6370,11 +6368,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s7, s8, s7
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, 11
-; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6396,7 +6394,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6422,19 +6420,16 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s2, s3
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s5, 0
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s2
; GFX1150-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
@@ -6459,7 +6454,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6469,7 +6464,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB9_8
; GFX1150-FAKE16-NEXT: .LBB9_2:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6504,11 +6499,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s5, s6, s5
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, 11
-; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6530,7 +6525,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6559,7 +6554,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s7
; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6569,7 +6564,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB9_16
; GFX1150-FAKE16-NEXT: .LBB9_10:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6604,11 +6599,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s7, s8, s7
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, 11
-; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6630,7 +6625,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6690,7 +6685,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
@@ -6702,7 +6697,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB9_8
; GFX1200-TRUE16-NEXT: .LBB9_2:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6737,11 +6732,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s5, s6, s5
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, 11
-; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6765,7 +6760,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6799,7 +6794,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7
@@ -6811,7 +6806,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB9_16
; GFX1200-TRUE16-NEXT: .LBB9_10:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6847,11 +6842,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s7, s8, s7
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, 11
-; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6875,7 +6870,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6902,20 +6897,17 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s2, 0x7c00
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s2, s3
; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s5, 0
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s4, 0x7c00
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s2
; GFX1200-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
@@ -6940,7 +6932,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6950,7 +6942,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB9_8
; GFX1200-FAKE16-NEXT: .LBB9_2:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6986,11 +6978,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s5, s6, s5
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, 11
-; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -7014,7 +7006,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -7047,7 +7039,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s7
; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -7058,7 +7050,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB9_16
; GFX1200-FAKE16-NEXT: .LBB9_10:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -7094,11 +7086,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s7, s8, s7
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, 11
-; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -7122,7 +7114,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -7208,7 +7200,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v8, v6
; SI-NEXT: s_cbranch_vccz .LBB10_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else86
; SI-NEXT: v_bfi_b32 v11, s0, 0, v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10
@@ -7219,7 +7211,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_2:
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_3: ; %frem.compute
+; SI-NEXT: .LBB10_3: ; %frem.compute85
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9
@@ -7255,10 +7247,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_5: ; %frem.loop_body
+; SI-NEXT: .LBB10_5: ; %frem.loop_body93
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v11, v9
; SI-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -7273,7 +7265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_5
; SI-NEXT: ; %bb.6: ; %Flow133
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: .LBB10_7: ; %frem.loop_exit
+; SI-NEXT: .LBB10_7: ; %frem.loop_exit94
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1
; SI-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -7294,7 +7286,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11
; SI-NEXT: s_cbranch_vccz .LBB10_10
-; SI-NEXT: ; %bb.9: ; %frem.else20
+; SI-NEXT: ; %bb.9: ; %frem.else53
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v12, s0, 0, v4
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
@@ -7306,7 +7298,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_10:
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_11: ; %frem.compute19
+; SI-NEXT: .LBB10_11: ; %frem.compute52
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10
@@ -7342,10 +7334,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_13: ; %frem.loop_body27
+; SI-NEXT: .LBB10_13: ; %frem.loop_body60
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v12, v10
; SI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -7360,7 +7352,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_13
; SI-NEXT: ; %bb.14: ; %Flow129
; SI-NEXT: v_mov_b32_e32 v10, v12
-; SI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; SI-NEXT: .LBB10_15: ; %frem.loop_exit61
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1
; SI-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -7381,7 +7373,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12
; SI-NEXT: s_cbranch_vccz .LBB10_18
-; SI-NEXT: ; %bb.17: ; %frem.else53
+; SI-NEXT: ; %bb.17: ; %frem.else20
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v13, s0, 0, v2
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
@@ -7393,7 +7385,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_18:
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_19: ; %frem.compute52
+; SI-NEXT: .LBB10_19: ; %frem.compute19
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11
@@ -7429,10 +7421,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_23
-; SI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_21: ; %frem.loop_body60
+; SI-NEXT: .LBB10_21: ; %frem.loop_body27
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v13, v11
; SI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -7447,7 +7439,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_21
; SI-NEXT: ; %bb.22: ; %Flow125
; SI-NEXT: v_mov_b32_e32 v11, v13
-; SI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; SI-NEXT: .LBB10_23: ; %frem.loop_exit28
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1
; SI-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -7468,7 +7460,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13
; SI-NEXT: s_cbranch_vccz .LBB10_26
-; SI-NEXT: ; %bb.25: ; %frem.else86
+; SI-NEXT: ; %bb.25: ; %frem.else
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v14, s0, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
@@ -7480,7 +7472,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_26:
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_27: ; %frem.compute85
+; SI-NEXT: .LBB10_27: ; %frem.compute
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12
@@ -7516,10 +7508,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_31
-; SI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_29: ; %frem.loop_body93
+; SI-NEXT: .LBB10_29: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v14, v12
; SI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -7534,7 +7526,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_29
; SI-NEXT: ; %bb.30: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v14
-; SI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; SI-NEXT: .LBB10_31: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1
; SI-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -7638,7 +7630,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB10_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else86
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
; CI-NEXT: v_bfi_b32 v11, s0, 0, v6
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9
@@ -7647,7 +7639,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_8
; CI-NEXT: .LBB10_2:
; CI-NEXT: ; implicit-def: $vgpr8
-; CI-NEXT: .LBB10_3: ; %frem.compute
+; CI-NEXT: .LBB10_3: ; %frem.compute85
; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10
; CI-NEXT: v_frexp_mant_f32_e32 v8, v10
; CI-NEXT: v_frexp_mant_f32_e32 v10, v9
@@ -7672,10 +7664,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10
; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14
; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10
-; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: .LBB10_5: ; %frem.loop_body93
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v13, v11
; CI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -7690,7 +7682,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_5
; CI-NEXT: ; %bb.6: ; %Flow133
; CI-NEXT: v_mov_b32_e32 v11, v13
-; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit94
; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10
; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10
; CI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -7711,7 +7703,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10
; CI-NEXT: s_cbranch_vccz .LBB10_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else53
; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v12, s0, 0, v4
@@ -7721,7 +7713,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_16
; CI-NEXT: .LBB10_10:
; CI-NEXT: ; implicit-def: $vgpr9
-; CI-NEXT: .LBB10_11: ; %frem.compute19
+; CI-NEXT: .LBB10_11: ; %frem.compute52
; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11
; CI-NEXT: v_frexp_mant_f32_e32 v9, v11
; CI-NEXT: v_frexp_mant_f32_e32 v11, v10
@@ -7746,10 +7738,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11
; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11
-; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: .LBB10_13: ; %frem.loop_body60
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v14, v12
; CI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -7764,7 +7756,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_13
; CI-NEXT: ; %bb.14: ; %Flow129
; CI-NEXT: v_mov_b32_e32 v12, v14
-; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit61
; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11
; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11
; CI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -7785,7 +7777,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11
; CI-NEXT: s_cbranch_vccz .LBB10_18
-; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: ; %bb.17: ; %frem.else20
; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v13, s0, 0, v2
@@ -7795,7 +7787,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_24
; CI-NEXT: .LBB10_18:
; CI-NEXT: ; implicit-def: $vgpr10
-; CI-NEXT: .LBB10_19: ; %frem.compute52
+; CI-NEXT: .LBB10_19: ; %frem.compute19
; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12
; CI-NEXT: v_frexp_mant_f32_e32 v10, v12
; CI-NEXT: v_frexp_mant_f32_e32 v12, v11
@@ -7820,10 +7812,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12
; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_23
-; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16
; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12
-; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: .LBB10_21: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v13
; CI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -7838,7 +7830,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_21
; CI-NEXT: ; %bb.22: ; %Flow125
; CI-NEXT: v_mov_b32_e32 v13, v15
-; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12
; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12
; CI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -7859,7 +7851,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12
; CI-NEXT: s_cbranch_vccz .LBB10_26
-; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v14, s0, 0, v0
@@ -7869,7 +7861,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_32
; CI-NEXT: .LBB10_26:
; CI-NEXT: ; implicit-def: $vgpr11
-; CI-NEXT: .LBB10_27: ; %frem.compute85
+; CI-NEXT: .LBB10_27: ; %frem.compute
; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13
; CI-NEXT: v_frexp_mant_f32_e32 v11, v13
; CI-NEXT: v_frexp_mant_f32_e32 v13, v12
@@ -7894,10 +7886,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13
; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_31
-; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13
-; CI-NEXT: .LBB10_29: ; %frem.loop_body93
+; CI-NEXT: .LBB10_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v16, v14
; CI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -7912,7 +7904,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_29
; CI-NEXT: ; %bb.30: ; %Flow
; CI-NEXT: v_mov_b32_e32 v14, v16
-; CI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13
; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13
; CI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -8001,7 +7993,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v5, |v2|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; VI-NEXT: s_cbranch_vccz .LBB10_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else86
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v4, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -8010,7 +8002,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_8
; VI-NEXT: .LBB10_2:
; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: .LBB10_3: ; %frem.compute
+; VI-NEXT: .LBB10_3: ; %frem.compute85
; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; VI-NEXT: v_frexp_mant_f32_e32 v4, v6
; VI-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -8035,10 +8027,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10
; VI-NEXT: v_add_u32_e32 v6, vcc, 11, v6
-; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: .LBB10_5: ; %frem.loop_body93
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -8053,7 +8045,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_5
; VI-NEXT: ; %bb.6: ; %Flow133
; VI-NEXT: v_mov_b32_e32 v7, v9
-; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit94
; VI-NEXT: v_add_u32_e32 v6, vcc, -10, v6
; VI-NEXT: v_ldexp_f32 v6, v7, v6
; VI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -8073,7 +8065,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v8, |v6|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8
; VI-NEXT: s_cbranch_vccz .LBB10_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else53
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v7, s2, 0, v5
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8
@@ -8082,7 +8074,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_16
; VI-NEXT: .LBB10_10:
; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: .LBB10_11: ; %frem.compute19
+; VI-NEXT: .LBB10_11: ; %frem.compute52
; VI-NEXT: v_frexp_exp_i32_f32_e32 v12, v9
; VI-NEXT: v_frexp_mant_f32_e32 v7, v9
; VI-NEXT: v_frexp_mant_f32_e32 v9, v8
@@ -8107,10 +8099,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9
; VI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; VI-NEXT: v_sub_u32_e32 v9, vcc, v12, v13
; VI-NEXT: v_add_u32_e32 v9, vcc, 11, v9
-; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: .LBB10_13: ; %frem.loop_body60
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v12, v10
; VI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -8125,7 +8117,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_13
; VI-NEXT: ; %bb.14: ; %Flow129
; VI-NEXT: v_mov_b32_e32 v10, v12
-; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit61
; VI-NEXT: v_add_u32_e32 v9, vcc, -10, v9
; VI-NEXT: v_ldexp_f32 v9, v10, v9
; VI-NEXT: v_mul_f32_e32 v10, v9, v11
@@ -8143,7 +8135,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v9, |v3|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v9
; VI-NEXT: s_cbranch_vccz .LBB10_18
-; VI-NEXT: ; %bb.17: ; %frem.else53
+; VI-NEXT: ; %bb.17: ; %frem.else20
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v8, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9
@@ -8152,7 +8144,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_24
; VI-NEXT: .LBB10_18:
; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: .LBB10_19: ; %frem.compute52
+; VI-NEXT: .LBB10_19: ; %frem.compute19
; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10
; VI-NEXT: v_frexp_mant_f32_e32 v8, v10
; VI-NEXT: v_frexp_mant_f32_e32 v10, v9
@@ -8177,10 +8169,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10
; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_23
-; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14
; VI-NEXT: v_add_u32_e32 v10, vcc, 11, v10
-; VI-NEXT: .LBB10_21: ; %frem.loop_body60
+; VI-NEXT: .LBB10_21: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -8195,7 +8187,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_21
; VI-NEXT: ; %bb.22: ; %Flow125
; VI-NEXT: v_mov_b32_e32 v11, v13
-; VI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v10, vcc, -10, v10
; VI-NEXT: v_ldexp_f32 v10, v11, v10
; VI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -8215,7 +8207,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v12, |v10|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12
; VI-NEXT: s_cbranch_vccz .LBB10_26
-; VI-NEXT: ; %bb.25: ; %frem.else86
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v11, s2, 0, v9
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12
@@ -8224,7 +8216,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_32
; VI-NEXT: .LBB10_26:
; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: .LBB10_27: ; %frem.compute85
+; VI-NEXT: .LBB10_27: ; %frem.compute
; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13
; VI-NEXT: v_frexp_mant_f32_e32 v11, v13
; VI-NEXT: v_frexp_mant_f32_e32 v13, v12
@@ -8249,10 +8241,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13
; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_31
-; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v13, vcc, 11, v13
-; VI-NEXT: .LBB10_29: ; %frem.loop_body93
+; VI-NEXT: .LBB10_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v16, v14
; VI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -8267,7 +8259,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_29
; VI-NEXT: ; %bb.30: ; %Flow
; VI-NEXT: v_mov_b32_e32 v14, v16
-; VI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v13, vcc, -10, v13
; VI-NEXT: v_ldexp_f32 v13, v14, v13
; VI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -8320,7 +8312,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v5, |v0|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; GFX9-NEXT: s_cbranch_vccz .LBB10_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else86
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v2
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -8329,7 +8321,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_8
; GFX9-NEXT: .LBB10_2:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB10_3: ; %frem.compute
+; GFX9-NEXT: .LBB10_3: ; %frem.compute85
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -8354,10 +8346,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 11, v6
-; GFX9-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -8372,7 +8364,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_5
; GFX9-NEXT: ; %bb.6: ; %Flow133
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX9-NEXT: v_add_u32_e32 v6, -10, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -8391,7 +8383,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7
; GFX9-NEXT: s_cbranch_vccz .LBB10_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else20
+; GFX9-NEXT: ; %bb.9: ; %frem.else53
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v6, s2, 0, v5
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7
@@ -8400,7 +8392,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_16
; GFX9-NEXT: .LBB10_10:
; GFX9-NEXT: ; implicit-def: $vgpr6
-; GFX9-NEXT: .LBB10_11: ; %frem.compute19
+; GFX9-NEXT: .LBB10_11: ; %frem.compute52
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v8
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v8
; GFX9-NEXT: v_frexp_mant_f32_e32 v8, v7
@@ -8425,10 +8417,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8
; GFX9-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX9-NEXT: v_sub_u32_e32 v8, v11, v12
; GFX9-NEXT: v_add_u32_e32 v8, 11, v8
-; GFX9-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX9-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v11, v9
; GFX9-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -8443,7 +8435,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_13
; GFX9-NEXT: ; %bb.14: ; %Flow129
; GFX9-NEXT: v_mov_b32_e32 v9, v11
-; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX9-NEXT: v_add_u32_e32 v8, -10, v8
; GFX9-NEXT: v_ldexp_f32 v8, v9, v8
; GFX9-NEXT: v_mul_f32_e32 v9, v8, v10
@@ -8461,7 +8453,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v1|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8
; GFX9-NEXT: s_cbranch_vccz .LBB10_18
-; GFX9-NEXT: ; %bb.17: ; %frem.else53
+; GFX9-NEXT: ; %bb.17: ; %frem.else20
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v7, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8
@@ -8470,7 +8462,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_24
; GFX9-NEXT: .LBB10_18:
; GFX9-NEXT: ; implicit-def: $vgpr7
-; GFX9-NEXT: .LBB10_19: ; %frem.compute52
+; GFX9-NEXT: .LBB10_19: ; %frem.compute19
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v9
; GFX9-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v8
@@ -8495,10 +8487,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9
; GFX9-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX9-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX9-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX9-NEXT: v_sub_u32_e32 v9, v12, v13
; GFX9-NEXT: v_add_u32_e32 v9, 11, v9
-; GFX9-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX9-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v12, v10
; GFX9-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -8513,7 +8505,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_21
; GFX9-NEXT: ; %bb.22: ; %Flow125
; GFX9-NEXT: v_mov_b32_e32 v10, v12
-; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX9-NEXT: v_add_u32_e32 v9, -10, v9
; GFX9-NEXT: v_ldexp_f32 v9, v10, v9
; GFX9-NEXT: v_mul_f32_e32 v10, v9, v11
@@ -8532,7 +8524,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10
; GFX9-NEXT: s_cbranch_vccz .LBB10_26
-; GFX9-NEXT: ; %bb.25: ; %frem.else86
+; GFX9-NEXT: ; %bb.25: ; %frem.else
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v8
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10
@@ -8541,7 +8533,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_32
; GFX9-NEXT: .LBB10_26:
; GFX9-NEXT: ; implicit-def: $vgpr9
-; GFX9-NEXT: .LBB10_27: ; %frem.compute85
+; GFX9-NEXT: .LBB10_27: ; %frem.compute
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v11
; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v11
; GFX9-NEXT: v_frexp_mant_f32_e32 v11, v10
@@ -8566,10 +8558,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11
; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX9-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15
; GFX9-NEXT: v_add_u32_e32 v11, 11, v11
-; GFX9-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX9-NEXT: .LBB10_29: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v14, v12
; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -8584,7 +8576,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_29
; GFX9-NEXT: ; %bb.30: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v12, v14
-; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v11, -10, v11
; GFX9-NEXT: v_ldexp_f32 v11, v12, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -8640,7 +8632,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v5, |v0|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX10-NEXT: s_cbranch_vccz .LBB10_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else86
; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v2
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo
@@ -8648,7 +8640,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_8
; GFX10-NEXT: .LBB10_2:
; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: .LBB10_3: ; %frem.compute
+; GFX10-NEXT: .LBB10_3: ; %frem.compute85
; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX10-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -8675,10 +8667,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8694,7 +8686,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow133
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -8712,7 +8704,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v5|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v8, v7
; GFX10-NEXT: s_cbranch_vccz .LBB10_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else20
+; GFX10-NEXT: ; %bb.9: ; %frem.else53
; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, 0, v5
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v8, v7
; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
@@ -8720,7 +8712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_16
; GFX10-NEXT: .LBB10_10:
; GFX10-NEXT: ; implicit-def: $vgpr6
-; GFX10-NEXT: .LBB10_11: ; %frem.compute19
+; GFX10-NEXT: .LBB10_11: ; %frem.compute52
; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v8
; GFX10-NEXT: v_frexp_mant_f32_e32 v10, v7
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v8
@@ -8747,10 +8739,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v10
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX10-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v11, v8
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8766,7 +8758,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow129
; GFX10-NEXT: v_mov_b32_e32 v10, s2
; GFX10-NEXT: v_mov_b32_e32 v8, v11
-; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX10-NEXT: v_add_nc_u32_e32 v10, -10, v10
; GFX10-NEXT: v_ldexp_f32 v8, v8, v10
; GFX10-NEXT: v_mul_f32_e32 v9, v8, v9
@@ -8783,7 +8775,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v1|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX10-NEXT: s_cbranch_vccz .LBB10_18
-; GFX10-NEXT: ; %bb.17: ; %frem.else53
+; GFX10-NEXT: ; %bb.17: ; %frem.else20
; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc_lo
@@ -8791,7 +8783,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_24
; GFX10-NEXT: .LBB10_18:
; GFX10-NEXT: ; implicit-def: $vgpr7
-; GFX10-NEXT: .LBB10_19: ; %frem.compute52
+; GFX10-NEXT: .LBB10_19: ; %frem.compute19
; GFX10-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX10-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -8818,10 +8810,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11
; GFX10-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX10-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX10-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX10-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v12, v9
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8837,7 +8829,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.22: ; %Flow125
; GFX10-NEXT: v_mov_b32_e32 v11, s2
; GFX10-NEXT: v_mov_b32_e32 v9, v12
-; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX10-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX10-NEXT: v_ldexp_f32 v9, v9, v11
; GFX10-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -8855,7 +8847,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v11, |v8|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v11, v10
; GFX10-NEXT: s_cbranch_vccz .LBB10_26
-; GFX10-NEXT: ; %bb.25: ; %frem.else86
+; GFX10-NEXT: ; %bb.25: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, 0, v8
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo
@@ -8863,7 +8855,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_32
; GFX10-NEXT: .LBB10_26:
; GFX10-NEXT: ; implicit-def: $vgpr9
-; GFX10-NEXT: .LBB10_27: ; %frem.compute85
+; GFX10-NEXT: .LBB10_27: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e32 v9, v11
; GFX10-NEXT: v_frexp_mant_f32_e32 v13, v10
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v11
@@ -8890,10 +8882,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v13
; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX10-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX10-NEXT: .LBB10_29: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8909,7 +8901,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.30: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v13, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v14
-; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v13, -10, v13
; GFX10-NEXT: v_ldexp_f32 v11, v11, v13
; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -8963,7 +8955,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
@@ -8974,7 +8966,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_8
; GFX11-TRUE16-NEXT: .LBB10_2:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
-; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -9010,11 +9002,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v6
@@ -9034,7 +9026,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, v8
@@ -9061,7 +9053,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10
-; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
@@ -9072,7 +9064,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_16
; GFX11-TRUE16-NEXT: .LBB10_10:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -9108,11 +9100,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v9
@@ -9132,7 +9124,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v12
-; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, v11
@@ -9156,7 +9148,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18
-; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9
@@ -9167,7 +9159,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_24
; GFX11-TRUE16-NEXT: .LBB10_18:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8
-; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v10
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v12, v9
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10
@@ -9203,11 +9195,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v10
@@ -9227,7 +9219,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, -10, v12
; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, v12
@@ -9254,7 +9246,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26
-; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12
@@ -9265,7 +9257,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_32
; GFX11-TRUE16-NEXT: .LBB10_26:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11
-; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v13
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v15, v12
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13
@@ -9301,11 +9293,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, v13
@@ -9325,7 +9317,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, -10, v15
; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v13, v15
@@ -9346,29 +9338,23 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: .LBB10_32: ; %Flow124
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2.l
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v6.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v5.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v7.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l|
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v10.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v8.l, s2
; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v9.l|
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v11.l, s2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1
-; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v11.l, s2
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: frem_v4f16:
@@ -9388,7 +9374,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_2
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9397,7 +9383,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_8
; GFX11-FAKE16-NEXT: .LBB10_2:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
-; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -9433,11 +9419,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v6
@@ -9457,7 +9443,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, v8
@@ -9483,7 +9469,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_10
-; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9492,7 +9478,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_16
; GFX11-FAKE16-NEXT: .LBB10_10:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -9528,11 +9514,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v9
@@ -9552,7 +9538,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v12
-; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, v11
@@ -9575,7 +9561,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_18
-; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9584,7 +9570,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_24
; GFX11-FAKE16-NEXT: .LBB10_18:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
-; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v10
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v12, v9
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10
@@ -9620,11 +9606,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v10
@@ -9644,7 +9630,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, -10, v12
; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, v12
@@ -9670,7 +9656,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_26
-; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9679,7 +9665,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_32
; GFX11-FAKE16-NEXT: .LBB10_26:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
-; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v13
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v15, v12
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13
@@ -9715,11 +9701,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v13
@@ -9739,7 +9725,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, -10, v15
; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v13, v15
@@ -9804,7 +9790,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
@@ -9816,7 +9802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_8
; GFX1150-TRUE16-NEXT: .LBB10_2:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -9851,11 +9837,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s6, s8, s6
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, 11
-; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -9877,7 +9863,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -9907,7 +9893,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -9919,7 +9905,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_16
; GFX1150-TRUE16-NEXT: .LBB10_10:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -9954,11 +9940,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -9980,7 +9966,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10008,7 +9994,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10020,7 +10006,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_24
; GFX1150-TRUE16-NEXT: .LBB10_18:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10055,11 +10041,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -10081,7 +10067,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -10111,7 +10097,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11
@@ -10123,7 +10109,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_32
; GFX1150-TRUE16-NEXT: .LBB10_26:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3
-; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -10158,11 +10144,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s11, s12, s11
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, 11
-; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v8, v5
@@ -10184,7 +10170,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -10209,21 +10195,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s3, 0x7c00
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s3, s3, s4
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s6, 0
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s3, s4, s3
; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s2, 0
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s3
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
@@ -10232,13 +10216,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1150-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_and_b32 s2, s3, s2
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2
-; GFX1150-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s2
; GFX1150-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
@@ -10265,7 +10246,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10275,7 +10256,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_8
; GFX1150-FAKE16-NEXT: .LBB10_2:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -10310,11 +10291,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s6, s8, s6
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, 11
-; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -10336,7 +10317,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -10365,7 +10346,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10375,7 +10356,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_16
; GFX1150-FAKE16-NEXT: .LBB10_10:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -10410,11 +10391,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -10436,7 +10417,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10463,7 +10444,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10473,7 +10454,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_24
; GFX1150-FAKE16-NEXT: .LBB10_18:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10508,11 +10489,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -10534,7 +10515,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -10563,7 +10544,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s12, s11
; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10573,7 +10554,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_32
; GFX1150-FAKE16-NEXT: .LBB10_26:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr3
-; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -10608,11 +10589,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s11, s12, s11
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, 11
-; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v8, v5
@@ -10634,7 +10615,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -10712,7 +10693,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
@@ -10724,7 +10705,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_8
; GFX1200-TRUE16-NEXT: .LBB10_2:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -10759,11 +10740,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s6, s8, s6
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, 11
-; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -10787,7 +10768,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -10821,7 +10802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10833,7 +10814,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_16
; GFX1200-TRUE16-NEXT: .LBB10_10:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -10869,11 +10850,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -10897,7 +10878,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10928,7 +10909,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10941,7 +10922,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_24
; GFX1200-TRUE16-NEXT: .LBB10_18:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10977,11 +10958,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -11005,7 +10986,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -11039,7 +11020,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11
@@ -11051,7 +11032,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_32
; GFX1200-TRUE16-NEXT: .LBB10_26:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3
-; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -11087,11 +11068,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s11, s12, s11
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, 11
-; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v8, v5
@@ -11115,7 +11096,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -11147,18 +11128,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s3
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s5, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s3, s4, s3
; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s2, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v1.l, s3
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s3
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s8, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
@@ -11168,15 +11145,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX1200-TRUE16-NEXT: s_cmp_nge_f16 s7, 0x7c00
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_and_b32 s2, s3, s2
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v3.l, s2
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s2
; GFX1200-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
@@ -11203,7 +11176,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11213,7 +11186,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_8
; GFX1200-FAKE16-NEXT: .LBB10_2:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -11249,11 +11222,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s6, s8, s6
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, 11
-; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -11277,7 +11250,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -11310,7 +11283,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11321,7 +11294,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_16
; GFX1200-FAKE16-NEXT: .LBB10_10:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -11357,11 +11330,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -11385,7 +11358,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -11415,7 +11388,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11426,7 +11399,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_24
; GFX1200-FAKE16-NEXT: .LBB10_18:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -11462,11 +11435,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -11490,7 +11463,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -11523,7 +11496,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s12, s11
; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11534,7 +11507,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_32
; GFX1200-FAKE16-NEXT: .LBB10_26:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr3
-; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -11570,11 +11543,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s11, s12, s11
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, 11
-; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v8, v5
@@ -11598,7 +11571,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -11686,7 +11659,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB11_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else16
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v4, s2, 0, v0
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -11697,7 +11670,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB11_2:
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB11_3: ; %frem.compute
+; SI-NEXT: .LBB11_3: ; %frem.compute15
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v0
@@ -11733,10 +11706,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB11_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB11_5: ; %frem.loop_body
+; SI-NEXT: .LBB11_5: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -11751,7 +11724,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB11_5
; SI-NEXT: ; %bb.6: ; %Flow51
; SI-NEXT: v_mov_b32_e32 v5, v7
-; SI-NEXT: .LBB11_7: ; %frem.loop_exit
+; SI-NEXT: .LBB11_7: ; %frem.loop_exit24
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v5, v5, s3
; SI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -11767,7 +11740,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB11_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v5, s2, 0, v1
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -11778,7 +11751,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB11_10:
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB11_11: ; %frem.compute15
+; SI-NEXT: .LBB11_11: ; %frem.compute
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1
@@ -11814,10 +11787,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB11_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB11_13: ; %frem.loop_body23
+; SI-NEXT: .LBB11_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v8, v6
; SI-NEXT: v_mul_f32_e32 v6, v8, v7
@@ -11832,7 +11805,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB11_13
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB11_15: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v6, v6, s3
; SI-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -11877,7 +11850,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB11_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v4, s2, 0, v0
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -11886,7 +11859,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_8
; CI-NEXT: .LBB11_2:
; CI-NEXT: ; implicit-def: $vgpr4
-; CI-NEXT: .LBB11_3: ; %frem.compute
+; CI-NEXT: .LBB11_3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; CI-NEXT: v_ldexp_f32_e64 v5, v5, 1
; CI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -11911,10 +11884,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10
; CI-NEXT: v_add_i32_e32 v6, vcc, 12, v6
-; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: .LBB11_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -11929,7 +11902,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB11_5
; CI-NEXT: ; %bb.6: ; %Flow51
; CI-NEXT: v_mov_b32_e32 v7, v9
-; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6
; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6
; CI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -11945,7 +11918,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB11_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v5, s2, 0, v1
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -11954,7 +11927,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_16
; CI-NEXT: .LBB11_10:
; CI-NEXT: ; implicit-def: $vgpr5
-; CI-NEXT: .LBB11_11: ; %frem.compute15
+; CI-NEXT: .LBB11_11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1
; CI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -11979,10 +11952,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11
; CI-NEXT: v_add_i32_e32 v7, vcc, 12, v7
-; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: .LBB11_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v10, v8
; CI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -11997,7 +11970,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB11_13
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v8, v10
-; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7
; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7
; CI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12042,7 +12015,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB11_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v4, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -12051,7 +12024,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_8
; VI-NEXT: .LBB11_2:
; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: .LBB11_3: ; %frem.compute
+; VI-NEXT: .LBB11_3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; VI-NEXT: v_ldexp_f32 v5, v5, 1
; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -12076,10 +12049,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10
; VI-NEXT: v_add_u32_e32 v6, vcc, 12, v6
-; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: .LBB11_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -12094,7 +12067,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB11_5
; VI-NEXT: ; %bb.6: ; %Flow51
; VI-NEXT: v_mov_b32_e32 v7, v9
-; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6
; VI-NEXT: v_ldexp_f32 v6, v7, v6
; VI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -12110,7 +12083,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB11_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v5, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -12119,7 +12092,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_16
; VI-NEXT: .LBB11_10:
; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: .LBB11_11: ; %frem.compute15
+; VI-NEXT: .LBB11_11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; VI-NEXT: v_ldexp_f32 v6, v6, 1
; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -12144,10 +12117,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11
; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v7
-; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: .LBB11_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v10, v8
; VI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -12162,7 +12135,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB11_13
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v8, v10
-; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7
; VI-NEXT: v_ldexp_f32 v7, v8, v7
; VI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12202,7 +12175,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB11_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v0
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -12211,7 +12184,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB11_8
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB11_3: ; %frem.compute
+; GFX9-NEXT: .LBB11_3: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX9-NEXT: v_ldexp_f32 v5, v5, 1
; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -12236,10 +12209,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 12, v6
-; GFX9-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -12254,7 +12227,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB11_5
; GFX9-NEXT: ; %bb.6: ; %Flow51
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX9-NEXT: v_add_u32_e32 v6, -11, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -12270,7 +12243,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB11_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v5, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -12279,7 +12252,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB11_16
; GFX9-NEXT: .LBB11_10:
; GFX9-NEXT: ; implicit-def: $vgpr5
-; GFX9-NEXT: .LBB11_11: ; %frem.compute15
+; GFX9-NEXT: .LBB11_11: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX9-NEXT: v_ldexp_f32 v6, v6, 1
; GFX9-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -12304,10 +12277,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; GFX9-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v7, v10, v11
; GFX9-NEXT: v_add_u32_e32 v7, 12, v7
-; GFX9-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB11_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v10, v8
; GFX9-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -12322,7 +12295,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB11_13
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v8, v10
-; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v7, -11, v7
; GFX9-NEXT: v_ldexp_f32 v7, v8, v7
; GFX9-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12363,7 +12336,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB11_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo
@@ -12371,7 +12344,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB11_8
; GFX10-NEXT: .LBB11_2:
; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: .LBB11_3: ; %frem.compute
+; GFX10-NEXT: .LBB11_3: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX10-NEXT: v_frexp_mant_f32_e64 v4, |v0|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
@@ -12398,10 +12371,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -12417,7 +12390,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow51
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX10-NEXT: v_add_nc_u32_e32 v8, -11, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -12432,7 +12405,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB11_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo
@@ -12440,7 +12413,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB11_16
; GFX10-NEXT: .LBB11_10:
; GFX10-NEXT: ; implicit-def: $vgpr5
-; GFX10-NEXT: .LBB11_11: ; %frem.compute15
+; GFX10-NEXT: .LBB11_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v1|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v1
@@ -12467,10 +12440,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9
; GFX10-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB11_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v10, v7
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -12486,7 +12459,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v9, s2
; GFX10-NEXT: v_mov_b32_e32 v7, v10
-; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v9, -11, v9
; GFX10-NEXT: v_ldexp_f32 v7, v7, v9
; GFX10-NEXT: v_mul_f32_e32 v8, v7, v8
@@ -12524,7 +12497,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB11_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -12533,7 +12506,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB11_8
; GFX11-NEXT: .LBB11_2:
; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: .LBB11_3: ; %frem.compute
+; GFX11-NEXT: .LBB11_3: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX11-NEXT: v_frexp_mant_f32_e64 v4, |v0|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
@@ -12569,11 +12542,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v9, v6
@@ -12593,7 +12566,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow51
; GFX11-NEXT: v_mov_b32_e32 v8, s2
; GFX11-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v8, -11, v8
; GFX11-NEXT: v_ldexp_f32 v6, v6, v8
@@ -12613,7 +12586,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB11_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -12622,7 +12595,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB11_16
; GFX11-NEXT: .LBB11_10:
; GFX11-NEXT: ; implicit-def: $vgpr5
-; GFX11-NEXT: .LBB11_11: ; %frem.compute15
+; GFX11-NEXT: .LBB11_11: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v1|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v1
@@ -12658,11 +12631,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB11_13: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v10, v7
@@ -12682,7 +12655,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow
; GFX11-NEXT: v_mov_b32_e32 v9, s2
; GFX11-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v9, -11, v9
; GFX11-NEXT: v_ldexp_f32 v7, v7, v9
@@ -12730,7 +12703,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8
; GFX1150-NEXT: s_cbranch_scc0 .LBB11_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
; GFX1150-NEXT: s_cmp_eq_f32 s3, s8
; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12740,7 +12713,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB11_8
; GFX1150-NEXT: .LBB11_2:
; GFX1150-NEXT: ; implicit-def: $vgpr0
-; GFX1150-NEXT: .LBB11_3: ; %frem.compute
+; GFX1150-NEXT: .LBB11_3: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s6|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -12775,11 +12748,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s7, s7, s8
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s7, s7, 12
-; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v5, v2
@@ -12801,7 +12774,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow51
; GFX1150-NEXT: v_mov_b32_e32 v4, s7
; GFX1150-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4
@@ -12824,7 +12797,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s6, s8
; GFX1150-NEXT: s_cbranch_scc0 .LBB11_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
; GFX1150-NEXT: s_cmp_eq_f32 s6, s8
; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12834,7 +12807,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB11_16
; GFX1150-NEXT: .LBB11_10:
; GFX1150-NEXT: ; implicit-def: $vgpr1
-; GFX1150-NEXT: .LBB11_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB11_11: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s2|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s5
@@ -12869,11 +12842,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s7, s7, s8
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s7, s7, 12
-; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v6, v3
@@ -12895,7 +12868,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow
; GFX1150-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5
@@ -12950,7 +12923,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8
; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
; GFX1200-NEXT: s_cmp_eq_f32 s3, s8
; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12960,7 +12933,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB11_8
; GFX1200-NEXT: .LBB11_2:
; GFX1200-NEXT: ; implicit-def: $vgpr0
-; GFX1200-NEXT: .LBB11_3: ; %frem.compute
+; GFX1200-NEXT: .LBB11_3: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -12996,11 +12969,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v5, v2
@@ -13024,7 +12997,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow51
; GFX1200-NEXT: v_mov_b32_e32 v4, s7
; GFX1200-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4
@@ -13048,7 +13021,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8
; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
; GFX1200-NEXT: s_cmp_eq_f32 s6, s8
; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -13059,7 +13032,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB11_16
; GFX1200-NEXT: .LBB11_10:
; GFX1200-NEXT: ; implicit-def: $vgpr1
-; GFX1200-NEXT: .LBB11_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB11_11: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5
@@ -13095,11 +13068,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v6, v3
@@ -13123,7 +13096,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow
; GFX1200-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5
@@ -13187,7 +13160,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else78
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v8, s2, 0, v0
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13198,7 +13171,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_2:
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_3: ; %frem.compute
+; SI-NEXT: .LBB12_3: ; %frem.compute77
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v0
@@ -13234,10 +13207,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_5: ; %frem.loop_body
+; SI-NEXT: .LBB12_5: ; %frem.loop_body85
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v11, v9
; SI-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -13252,7 +13225,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_5
; SI-NEXT: ; %bb.6: ; %Flow125
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: .LBB12_7: ; %frem.loop_exit
+; SI-NEXT: .LBB12_7: ; %frem.loop_exit86
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v9, v9, s3
; SI-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -13268,7 +13241,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else47
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v9, s2, 0, v1
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13279,7 +13252,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_10:
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_11: ; %frem.compute15
+; SI-NEXT: .LBB12_11: ; %frem.compute46
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v1
@@ -13315,10 +13288,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_13: ; %frem.loop_body23
+; SI-NEXT: .LBB12_13: ; %frem.loop_body54
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v12, v10
; SI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -13333,7 +13306,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_13
; SI-NEXT: ; %bb.14: ; %Flow121
; SI-NEXT: v_mov_b32_e32 v10, v12
-; SI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB12_15: ; %frem.loop_exit55
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v10, v10, s3
; SI-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -13349,7 +13322,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_18
-; SI-NEXT: ; %bb.17: ; %frem.else47
+; SI-NEXT: ; %bb.17: ; %frem.else16
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v10, s2, 0, v2
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -13360,7 +13333,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_18:
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_19: ; %frem.compute46
+; SI-NEXT: .LBB12_19: ; %frem.compute15
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2
@@ -13396,10 +13369,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_23
-; SI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; SI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_21: ; %frem.loop_body54
+; SI-NEXT: .LBB12_21: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v13, v11
; SI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13414,7 +13387,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_21
; SI-NEXT: ; %bb.22: ; %Flow117
; SI-NEXT: v_mov_b32_e32 v11, v13
-; SI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; SI-NEXT: .LBB12_23: ; %frem.loop_exit24
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v11, v11, s3
; SI-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -13430,7 +13403,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_26
-; SI-NEXT: ; %bb.25: ; %frem.else78
+; SI-NEXT: ; %bb.25: ; %frem.else
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v11, s2, 0, v3
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -13441,7 +13414,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_26:
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_27: ; %frem.compute77
+; SI-NEXT: .LBB12_27: ; %frem.compute
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3
@@ -13477,10 +13450,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_31
-; SI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_29: ; %frem.loop_body85
+; SI-NEXT: .LBB12_29: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v14, v12
; SI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13495,7 +13468,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_29
; SI-NEXT: ; %bb.30: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v14
-; SI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; SI-NEXT: .LBB12_31: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v12, v12, s3
; SI-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -13548,7 +13521,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else78
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v8, s2, 0, v0
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13557,7 +13530,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_8
; CI-NEXT: .LBB12_2:
; CI-NEXT: ; implicit-def: $vgpr8
-; CI-NEXT: .LBB12_3: ; %frem.compute
+; CI-NEXT: .LBB12_3: ; %frem.compute77
; CI-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; CI-NEXT: v_ldexp_f32_e64 v9, v9, 1
; CI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -13582,10 +13555,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14
; CI-NEXT: v_add_i32_e32 v10, vcc, 12, v10
-; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: .LBB12_5: ; %frem.loop_body85
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v13, v11
; CI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13600,7 +13573,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_5
; CI-NEXT: ; %bb.6: ; %Flow125
; CI-NEXT: v_mov_b32_e32 v11, v13
-; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit86
; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10
; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10
; CI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -13616,7 +13589,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else47
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v9, s2, 0, v1
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13625,7 +13598,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_16
; CI-NEXT: .LBB12_10:
; CI-NEXT: ; implicit-def: $vgpr9
-; CI-NEXT: .LBB12_11: ; %frem.compute15
+; CI-NEXT: .LBB12_11: ; %frem.compute46
; CI-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; CI-NEXT: v_ldexp_f32_e64 v10, v10, 1
; CI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -13650,10 +13623,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v11, vcc, 12, v11
-; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: .LBB12_13: ; %frem.loop_body54
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v14, v12
; CI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13668,7 +13641,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_13
; CI-NEXT: ; %bb.14: ; %Flow121
; CI-NEXT: v_mov_b32_e32 v12, v14
-; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit55
; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11
; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11
; CI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -13684,7 +13657,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_18
-; CI-NEXT: ; %bb.17: ; %frem.else47
+; CI-NEXT: ; %bb.17: ; %frem.else16
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v10, s2, 0, v2
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -13693,7 +13666,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_24
; CI-NEXT: .LBB12_18:
; CI-NEXT: ; implicit-def: $vgpr10
-; CI-NEXT: .LBB12_19: ; %frem.compute46
+; CI-NEXT: .LBB12_19: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; CI-NEXT: v_ldexp_f32_e64 v11, v11, 1
; CI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -13718,10 +13691,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_23
-; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16
; CI-NEXT: v_add_i32_e32 v12, vcc, 12, v12
-; CI-NEXT: .LBB12_21: ; %frem.loop_body54
+; CI-NEXT: .LBB12_21: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v13
; CI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -13736,7 +13709,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_21
; CI-NEXT: ; %bb.22: ; %Flow117
; CI-NEXT: v_mov_b32_e32 v13, v15
-; CI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12
; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12
; CI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -13752,7 +13725,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_26
-; CI-NEXT: ; %bb.25: ; %frem.else78
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v11, s2, 0, v3
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -13761,7 +13734,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_32
; CI-NEXT: .LBB12_26:
; CI-NEXT: ; implicit-def: $vgpr11
-; CI-NEXT: .LBB12_27: ; %frem.compute77
+; CI-NEXT: .LBB12_27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; CI-NEXT: v_ldexp_f32_e64 v12, v12, 1
; CI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -13786,10 +13759,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_31
-; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v13, vcc, 12, v13
-; CI-NEXT: .LBB12_29: ; %frem.loop_body85
+; CI-NEXT: .LBB12_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v16, v14
; CI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -13804,7 +13777,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_29
; CI-NEXT: ; %bb.30: ; %Flow
; CI-NEXT: v_mov_b32_e32 v14, v16
-; CI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13
; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13
; CI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -13857,7 +13830,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else78
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v8, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13866,7 +13839,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_8
; VI-NEXT: .LBB12_2:
; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: .LBB12_3: ; %frem.compute
+; VI-NEXT: .LBB12_3: ; %frem.compute77
; VI-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; VI-NEXT: v_ldexp_f32 v9, v9, 1
; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -13891,10 +13864,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14
; VI-NEXT: v_add_u32_e32 v10, vcc, 12, v10
-; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: .LBB12_5: ; %frem.loop_body85
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13909,7 +13882,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_5
; VI-NEXT: ; %bb.6: ; %Flow125
; VI-NEXT: v_mov_b32_e32 v11, v13
-; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit86
; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10
; VI-NEXT: v_ldexp_f32 v10, v11, v10
; VI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -13925,7 +13898,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else47
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v9, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13934,7 +13907,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_16
; VI-NEXT: .LBB12_10:
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: .LBB12_11: ; %frem.compute15
+; VI-NEXT: .LBB12_11: ; %frem.compute46
; VI-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; VI-NEXT: v_ldexp_f32 v10, v10, 1
; VI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -13959,10 +13932,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; VI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; VI-NEXT: v_sub_u32_e32 v11, vcc, v14, v15
; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v11
-; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: .LBB12_13: ; %frem.loop_body54
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v14, v12
; VI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13977,7 +13950,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_13
; VI-NEXT: ; %bb.14: ; %Flow121
; VI-NEXT: v_mov_b32_e32 v12, v14
-; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit55
; VI-NEXT: v_add_u32_e32 v11, vcc, -11, v11
; VI-NEXT: v_ldexp_f32 v11, v12, v11
; VI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -13993,7 +13966,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_18
-; VI-NEXT: ; %bb.17: ; %frem.else47
+; VI-NEXT: ; %bb.17: ; %frem.else16
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v10, s2, 0, v2
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -14002,7 +13975,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_24
; VI-NEXT: .LBB12_18:
; VI-NEXT: ; implicit-def: $vgpr10
-; VI-NEXT: .LBB12_19: ; %frem.compute46
+; VI-NEXT: .LBB12_19: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; VI-NEXT: v_ldexp_f32 v11, v11, 1
; VI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -14027,10 +14000,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; VI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_23
-; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v12, vcc, v15, v16
; VI-NEXT: v_add_u32_e32 v12, vcc, 12, v12
-; VI-NEXT: .LBB12_21: ; %frem.loop_body54
+; VI-NEXT: .LBB12_21: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v15, v13
; VI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -14045,7 +14018,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_21
; VI-NEXT: ; %bb.22: ; %Flow117
; VI-NEXT: v_mov_b32_e32 v13, v15
-; VI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v12, vcc, -11, v12
; VI-NEXT: v_ldexp_f32 v12, v13, v12
; VI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -14061,7 +14034,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_26
-; VI-NEXT: ; %bb.25: ; %frem.else78
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v11, s2, 0, v3
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -14070,7 +14043,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_32
; VI-NEXT: .LBB12_26:
; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: .LBB12_27: ; %frem.compute77
+; VI-NEXT: .LBB12_27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; VI-NEXT: v_ldexp_f32 v12, v12, 1
; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -14095,10 +14068,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_31
-; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v13, vcc, 12, v13
-; VI-NEXT: .LBB12_29: ; %frem.loop_body85
+; VI-NEXT: .LBB12_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v16, v14
; VI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -14113,7 +14086,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_29
; VI-NEXT: ; %bb.30: ; %Flow
; VI-NEXT: v_mov_b32_e32 v14, v16
-; VI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13
; VI-NEXT: v_ldexp_f32 v13, v14, v13
; VI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -14161,7 +14134,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else78
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v8, s2, 0, v0
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -14170,7 +14143,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_8
; GFX9-NEXT: .LBB12_2:
; GFX9-NEXT: ; implicit-def: $vgpr8
-; GFX9-NEXT: .LBB12_3: ; %frem.compute
+; GFX9-NEXT: .LBB12_3: ; %frem.compute77
; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX9-NEXT: v_ldexp_f32 v9, v9, 1
; GFX9-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -14195,10 +14168,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; GFX9-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX9-NEXT: v_sub_u32_e32 v10, v13, v14
; GFX9-NEXT: v_add_u32_e32 v10, 12, v10
-; GFX9-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -14213,7 +14186,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_5
; GFX9-NEXT: ; %bb.6: ; %Flow125
; GFX9-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX9-NEXT: v_add_u32_e32 v10, -11, v10
; GFX9-NEXT: v_ldexp_f32 v10, v11, v10
; GFX9-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -14229,7 +14202,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else47
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -14238,7 +14211,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_16
; GFX9-NEXT: .LBB12_10:
; GFX9-NEXT: ; implicit-def: $vgpr9
-; GFX9-NEXT: .LBB12_11: ; %frem.compute15
+; GFX9-NEXT: .LBB12_11: ; %frem.compute46
; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX9-NEXT: v_ldexp_f32 v10, v10, 1
; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -14263,10 +14236,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15
; GFX9-NEXT: v_add_u32_e32 v11, 12, v11
-; GFX9-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v14, v12
; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -14281,7 +14254,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_13
; GFX9-NEXT: ; %bb.14: ; %Flow121
; GFX9-NEXT: v_mov_b32_e32 v12, v14
-; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX9-NEXT: v_add_u32_e32 v11, -11, v11
; GFX9-NEXT: v_ldexp_f32 v11, v12, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -14297,7 +14270,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_18
-; GFX9-NEXT: ; %bb.17: ; %frem.else47
+; GFX9-NEXT: ; %bb.17: ; %frem.else16
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v10, s2, 0, v2
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -14306,7 +14279,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_24
; GFX9-NEXT: .LBB12_18:
; GFX9-NEXT: ; implicit-def: $vgpr10
-; GFX9-NEXT: .LBB12_19: ; %frem.compute46
+; GFX9-NEXT: .LBB12_19: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX9-NEXT: v_ldexp_f32 v11, v11, 1
; GFX9-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -14331,10 +14304,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; GFX9-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX9-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX9-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v12, v15, v16
; GFX9-NEXT: v_add_u32_e32 v12, 12, v12
-; GFX9-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX9-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v15, v13
; GFX9-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -14349,7 +14322,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_21
; GFX9-NEXT: ; %bb.22: ; %Flow117
; GFX9-NEXT: v_mov_b32_e32 v13, v15
-; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX9-NEXT: v_add_u32_e32 v12, -11, v12
; GFX9-NEXT: v_ldexp_f32 v12, v13, v12
; GFX9-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -14365,7 +14338,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_26
-; GFX9-NEXT: ; %bb.25: ; %frem.else78
+; GFX9-NEXT: ; %bb.25: ; %frem.else
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v11, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -14374,7 +14347,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_32
; GFX9-NEXT: .LBB12_26:
; GFX9-NEXT: ; implicit-def: $vgpr11
-; GFX9-NEXT: .LBB12_27: ; %frem.compute77
+; GFX9-NEXT: .LBB12_27: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX9-NEXT: v_ldexp_f32 v12, v12, 1
; GFX9-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -14399,10 +14372,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; GFX9-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX9-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v13, v16, v17
; GFX9-NEXT: v_add_u32_e32 v13, 12, v13
-; GFX9-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX9-NEXT: .LBB12_29: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v16, v14
; GFX9-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -14417,7 +14390,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_29
; GFX9-NEXT: ; %bb.30: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v13, -11, v13
; GFX9-NEXT: v_ldexp_f32 v13, v14, v13
; GFX9-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -14466,7 +14439,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else78
; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo
@@ -14474,7 +14447,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_8
; GFX10-NEXT: .LBB12_2:
; GFX10-NEXT: ; implicit-def: $vgpr8
-; GFX10-NEXT: .LBB12_3: ; %frem.compute
+; GFX10-NEXT: .LBB12_3: ; %frem.compute77
; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX10-NEXT: v_frexp_mant_f32_e64 v8, |v0|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v0
@@ -14501,10 +14474,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12
; GFX10-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v13, v10
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14520,7 +14493,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow125
; GFX10-NEXT: v_mov_b32_e32 v12, s2
; GFX10-NEXT: v_mov_b32_e32 v10, v13
-; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX10-NEXT: v_add_nc_u32_e32 v12, -11, v12
; GFX10-NEXT: v_ldexp_f32 v10, v10, v12
; GFX10-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -14535,7 +14508,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else47
; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo
@@ -14543,7 +14516,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_16
; GFX10-NEXT: .LBB12_10:
; GFX10-NEXT: ; implicit-def: $vgpr9
-; GFX10-NEXT: .LBB12_11: ; %frem.compute15
+; GFX10-NEXT: .LBB12_11: ; %frem.compute46
; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v1|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v1
@@ -14570,10 +14543,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13
; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14589,7 +14562,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow121
; GFX10-NEXT: v_mov_b32_e32 v13, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v14
-; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX10-NEXT: v_add_nc_u32_e32 v13, -11, v13
; GFX10-NEXT: v_ldexp_f32 v11, v11, v13
; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -14604,7 +14577,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_18
-; GFX10-NEXT: ; %bb.17: ; %frem.else47
+; GFX10-NEXT: ; %bb.17: ; %frem.else16
; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
; GFX10-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo
@@ -14612,7 +14585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_24
; GFX10-NEXT: .LBB12_18:
; GFX10-NEXT: ; implicit-def: $vgpr10
-; GFX10-NEXT: .LBB12_19: ; %frem.compute46
+; GFX10-NEXT: .LBB12_19: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v2|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v13, v2
@@ -14639,10 +14612,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14
; GFX10-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX10-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX10-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX10-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v15, v12
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14658,7 +14631,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.22: ; %Flow117
; GFX10-NEXT: v_mov_b32_e32 v14, s2
; GFX10-NEXT: v_mov_b32_e32 v12, v15
-; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX10-NEXT: v_add_nc_u32_e32 v14, -11, v14
; GFX10-NEXT: v_ldexp_f32 v12, v12, v14
; GFX10-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -14673,7 +14646,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_26
-; GFX10-NEXT: ; %bb.25: ; %frem.else78
+; GFX10-NEXT: ; %bb.25: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo
@@ -14681,7 +14654,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_32
; GFX10-NEXT: .LBB12_26:
; GFX10-NEXT: ; implicit-def: $vgpr11
-; GFX10-NEXT: .LBB12_27: ; %frem.compute77
+; GFX10-NEXT: .LBB12_27: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v3|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v14, v3
@@ -14708,10 +14681,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15
; GFX10-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX10-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX10-NEXT: .LBB12_29: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v16, v13
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14727,7 +14700,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.30: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v15, s2
; GFX10-NEXT: v_mov_b32_e32 v13, v16
-; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v15, -11, v15
; GFX10-NEXT: v_ldexp_f32 v13, v13, v15
; GFX10-NEXT: v_mul_f32_e32 v14, v13, v14
@@ -14773,7 +14746,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else78
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14782,7 +14755,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_8
; GFX11-NEXT: .LBB12_2:
; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: .LBB12_3: ; %frem.compute
+; GFX11-NEXT: .LBB12_3: ; %frem.compute77
; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX11-NEXT: v_frexp_mant_f32_e64 v8, |v0|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v0
@@ -14818,11 +14791,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v13, v10
@@ -14842,7 +14815,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow125
; GFX11-NEXT: v_mov_b32_e32 v12, s2
; GFX11-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v12, -11, v12
; GFX11-NEXT: v_ldexp_f32 v10, v10, v12
@@ -14862,7 +14835,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else47
; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14871,7 +14844,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_16
; GFX11-NEXT: .LBB12_10:
; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: .LBB12_11: ; %frem.compute15
+; GFX11-NEXT: .LBB12_11: ; %frem.compute46
; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v1|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v12, v1
@@ -14907,11 +14880,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v14, v11
@@ -14931,7 +14904,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow121
; GFX11-NEXT: v_mov_b32_e32 v13, s2
; GFX11-NEXT: v_mov_b32_e32 v11, v14
-; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v13, -11, v13
; GFX11-NEXT: v_ldexp_f32 v11, v11, v13
@@ -14951,7 +14924,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_18
-; GFX11-NEXT: ; %bb.17: ; %frem.else47
+; GFX11-NEXT: ; %bb.17: ; %frem.else16
; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14960,7 +14933,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_24
; GFX11-NEXT: .LBB12_18:
; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: .LBB12_19: ; %frem.compute46
+; GFX11-NEXT: .LBB12_19: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v2|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v13, v2
@@ -14996,11 +14969,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX11-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX11-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX11-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v15, v12
@@ -15020,7 +14993,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.22: ; %Flow117
; GFX11-NEXT: v_mov_b32_e32 v14, s2
; GFX11-NEXT: v_mov_b32_e32 v12, v15
-; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v14, -11, v14
; GFX11-NEXT: v_ldexp_f32 v12, v12, v14
@@ -15040,7 +15013,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_26
-; GFX11-NEXT: ; %bb.25: ; %frem.else78
+; GFX11-NEXT: ; %bb.25: ; %frem.else
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -15049,7 +15022,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_32
; GFX11-NEXT: .LBB12_26:
; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: .LBB12_27: ; %frem.compute77
+; GFX11-NEXT: .LBB12_27: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v3|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v14, v3
@@ -15085,11 +15058,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX11-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX11-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX11-NEXT: .LBB12_29: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v16, v13
@@ -15109,7 +15082,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.30: ; %Flow
; GFX11-NEXT: v_mov_b32_e32 v15, s2
; GFX11-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v15, -11, v15
; GFX11-NEXT: v_ldexp_f32 v13, v13, v15
@@ -15170,7 +15143,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else78
; GFX1150-NEXT: s_cmp_eq_f32 s5, s12
; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15180,7 +15153,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_8
; GFX1150-NEXT: .LBB12_2:
; GFX1150-NEXT: ; implicit-def: $vgpr0
-; GFX1150-NEXT: .LBB12_3: ; %frem.compute
+; GFX1150-NEXT: .LBB12_3: ; %frem.compute77
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -15215,11 +15188,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v5, v2
@@ -15241,7 +15214,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow125
; GFX1150-NEXT: v_mov_b32_e32 v4, s11
; GFX1150-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4
@@ -15264,7 +15237,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s8, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else47
; GFX1150-NEXT: s_cmp_eq_f32 s8, s12
; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15274,7 +15247,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_16
; GFX1150-NEXT: .LBB12_10:
; GFX1150-NEXT: ; implicit-def: $vgpr1
-; GFX1150-NEXT: .LBB12_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB12_11: ; %frem.compute46
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s4|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s10|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -15309,11 +15282,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v6, v3
@@ -15335,7 +15308,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow121
; GFX1150-NEXT: v_mov_b32_e32 v5, s11
; GFX1150-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5
@@ -15358,7 +15331,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s10, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_18
-; GFX1150-NEXT: ; %bb.17: ; %frem.else47
+; GFX1150-NEXT: ; %bb.17: ; %frem.else16
; GFX1150-NEXT: s_cmp_eq_f32 s10, s12
; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15368,7 +15341,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_24
; GFX1150-NEXT: .LBB12_18:
; GFX1150-NEXT: ; implicit-def: $vgpr2
-; GFX1150-NEXT: .LBB12_19: ; %frem.compute46
+; GFX1150-NEXT: .LBB12_19: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s3|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v5, s9
@@ -15403,11 +15376,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6
; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v7, v4
@@ -15429,7 +15402,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.22: ; %Flow117
; GFX1150-NEXT: v_mov_b32_e32 v6, s11
; GFX1150-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v6, -11, v6
; GFX1150-NEXT: v_ldexp_f32 v4, v4, v6
@@ -15452,7 +15425,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s9, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_26
-; GFX1150-NEXT: ; %bb.25: ; %frem.else78
+; GFX1150-NEXT: ; %bb.25: ; %frem.else
; GFX1150-NEXT: s_cmp_eq_f32 s9, s12
; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15462,7 +15435,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_32
; GFX1150-NEXT: .LBB12_26:
; GFX1150-NEXT: ; implicit-def: $vgpr3
-; GFX1150-NEXT: .LBB12_27: ; %frem.compute77
+; GFX1150-NEXT: .LBB12_27: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |s2|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s7|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, s7
@@ -15497,11 +15470,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7
; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v8, v5
@@ -15523,7 +15496,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.30: ; %Flow
; GFX1150-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7
; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7
@@ -15597,7 +15570,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else78
; GFX1200-NEXT: s_cmp_eq_f32 s5, s12
; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15607,7 +15580,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_8
; GFX1200-NEXT: .LBB12_2:
; GFX1200-NEXT: ; implicit-def: $vgpr0
-; GFX1200-NEXT: .LBB12_3: ; %frem.compute
+; GFX1200-NEXT: .LBB12_3: ; %frem.compute77
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -15643,11 +15616,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT: v_mov_b32_e32 v5, v2
@@ -15670,7 +15643,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow125
; GFX1200-NEXT: v_mov_b32_e32 v4, s11
; GFX1200-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4
@@ -15694,7 +15667,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else47
; GFX1200-NEXT: s_cmp_eq_f32 s8, s12
; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15705,7 +15678,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_16
; GFX1200-NEXT: .LBB12_10:
; GFX1200-NEXT: ; implicit-def: $vgpr1
-; GFX1200-NEXT: .LBB12_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB12_11: ; %frem.compute46
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -15741,11 +15714,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v6, v3
@@ -15769,7 +15742,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow121
; GFX1200-NEXT: v_mov_b32_e32 v5, s11
; GFX1200-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5
@@ -15793,7 +15766,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18
-; GFX1200-NEXT: ; %bb.17: ; %frem.else47
+; GFX1200-NEXT: ; %bb.17: ; %frem.else16
; GFX1200-NEXT: s_cmp_eq_f32 s10, s12
; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15804,7 +15777,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_24
; GFX1200-NEXT: .LBB12_18:
; GFX1200-NEXT: ; implicit-def: $vgpr2
-; GFX1200-NEXT: .LBB12_19: ; %frem.compute46
+; GFX1200-NEXT: .LBB12_19: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9
@@ -15840,11 +15813,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6
; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v7, v4
@@ -15868,7 +15841,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.22: ; %Flow117
; GFX1200-NEXT: v_mov_b32_e32 v6, s11
; GFX1200-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6
; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6
@@ -15892,7 +15865,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26
-; GFX1200-NEXT: ; %bb.25: ; %frem.else78
+; GFX1200-NEXT: ; %bb.25: ; %frem.else
; GFX1200-NEXT: s_cmp_eq_f32 s9, s12
; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15903,7 +15876,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_32
; GFX1200-NEXT: .LBB12_26:
; GFX1200-NEXT: ; implicit-def: $vgpr3
-; GFX1200-NEXT: .LBB12_27: ; %frem.compute77
+; GFX1200-NEXT: .LBB12_27: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7
@@ -15939,11 +15912,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7
; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v8, v5
@@ -15967,7 +15940,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.30: ; %Flow
; GFX1200-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7
; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7
@@ -16048,7 +16021,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
; SI-NEXT: s_cbranch_vccz .LBB13_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else16
; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; SI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16059,7 +16032,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB13_2:
; SI-NEXT: ; implicit-def: $vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB13_3: ; %frem.compute
+; SI-NEXT: .LBB13_3: ; %frem.compute15
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v1
; SI-NEXT: s_mov_b32 s0, 0
@@ -16105,13 +16078,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; SI-NEXT: s_cmp_lt_i32 s6, 27
; SI-NEXT: s_cbranch_scc1 .LBB13_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s0, s3, s7
; SI-NEXT: s_add_i32 s6, s0, 26
; SI-NEXT: s_mov_b32 s3, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v18, 0x43300000
; SI-NEXT: v_mov_b32_e32 v14, 0
-; SI-NEXT: .LBB13_5: ; %frem.loop_body
+; SI-NEXT: .LBB13_5: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v17, v11
; SI-NEXT: v_mov_b32_e32 v16, v10
@@ -16134,7 +16107,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: ; %bb.6: ; %Flow51
; SI-NEXT: v_mov_b32_e32 v10, v16
; SI-NEXT: v_mov_b32_e32 v11, v17
-; SI-NEXT: .LBB13_7: ; %frem.loop_exit
+; SI-NEXT: .LBB13_7: ; %frem.loop_exit24
; SI-NEXT: s_sub_i32 s0, s6, 25
; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s0
; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13]
@@ -16160,7 +16133,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
; SI-NEXT: s_cbranch_vccz .LBB13_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; SI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16171,7 +16144,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB13_10:
; SI-NEXT: ; implicit-def: $vgpr10_vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB13_11: ; %frem.compute15
+; SI-NEXT: .LBB13_11: ; %frem.compute
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v3
; SI-NEXT: s_mov_b32 s0, 0
@@ -16217,13 +16190,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; SI-NEXT: s_cmp_lt_i32 s6, 27
; SI-NEXT: s_cbranch_scc1 .LBB13_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s0, s3, s7
; SI-NEXT: s_add_i32 s6, s0, 26
; SI-NEXT: s_mov_b32 s3, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v20, 0x43300000
; SI-NEXT: v_mov_b32_e32 v16, 0
-; SI-NEXT: .LBB13_13: ; %frem.loop_body23
+; SI-NEXT: .LBB13_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v19, v13
; SI-NEXT: v_mov_b32_e32 v18, v12
@@ -16246,7 +16219,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v18
; SI-NEXT: v_mov_b32_e32 v13, v19
-; SI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB13_15: ; %frem.loop_exit
; SI-NEXT: s_sub_i32 s0, s6, 25
; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], s0
; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15]
@@ -16304,7 +16277,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB13_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; CI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16313,7 +16286,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB13_8
; CI-NEXT: .LBB13_2:
; CI-NEXT: ; implicit-def: $vgpr8_vgpr9
-; CI-NEXT: .LBB13_3: ; %frem.compute
+; CI-NEXT: .LBB13_3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; CI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; CI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16337,10 +16310,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; CI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v14, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v17, vcc, 26, v14
-; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: .LBB13_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v11
; CI-NEXT: v_mov_b32_e32 v14, v10
@@ -16358,7 +16331,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; %bb.6: ; %Flow51
; CI-NEXT: v_mov_b32_e32 v10, v14
; CI-NEXT: v_mov_b32_e32 v11, v15
-; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit24
; CI-NEXT: v_subrev_i32_e32 v14, vcc, 25, v17
; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; CI-NEXT: s_brev_b32 s2, -2
@@ -16375,7 +16348,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB13_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; CI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; CI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16384,7 +16357,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB13_16
; CI-NEXT: .LBB13_10:
; CI-NEXT: ; implicit-def: $vgpr10_vgpr11
-; CI-NEXT: .LBB13_11: ; %frem.compute15
+; CI-NEXT: .LBB13_11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; CI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; CI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16408,10 +16381,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; CI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v16, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v19, vcc, 26, v16
-; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: .LBB13_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v17, v13
; CI-NEXT: v_mov_b32_e32 v16, v12
@@ -16429,7 +16402,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v12, v16
; CI-NEXT: v_mov_b32_e32 v13, v17
-; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit
; CI-NEXT: v_subrev_i32_e32 v16, vcc, 25, v19
; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; CI-NEXT: s_brev_b32 s2, -2
@@ -16478,7 +16451,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB13_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; VI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16487,7 +16460,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB13_8
; VI-NEXT: .LBB13_2:
; VI-NEXT: ; implicit-def: $vgpr8_vgpr9
-; VI-NEXT: .LBB13_3: ; %frem.compute
+; VI-NEXT: .LBB13_3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; VI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; VI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16511,10 +16484,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; VI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v14, vcc, v14, v15
; VI-NEXT: v_add_u32_e32 v17, vcc, 26, v14
-; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: .LBB13_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v14, v10
@@ -16532,7 +16505,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; %bb.6: ; %Flow51
; VI-NEXT: v_mov_b32_e32 v10, v14
; VI-NEXT: v_mov_b32_e32 v11, v15
-; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit24
; VI-NEXT: v_subrev_u32_e32 v14, vcc, 25, v17
; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; VI-NEXT: s_brev_b32 s2, -2
@@ -16549,7 +16522,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB13_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; VI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; VI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16558,7 +16531,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB13_16
; VI-NEXT: .LBB13_10:
; VI-NEXT: ; implicit-def: $vgpr10_vgpr11
-; VI-NEXT: .LBB13_11: ; %frem.compute15
+; VI-NEXT: .LBB13_11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; VI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; VI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16582,10 +16555,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; VI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v16, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v19, vcc, 26, v16
-; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: .LBB13_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v17, v13
; VI-NEXT: v_mov_b32_e32 v16, v12
@@ -16603,7 +16576,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v12, v16
; VI-NEXT: v_mov_b32_e32 v13, v17
-; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit
; VI-NEXT: v_subrev_u32_e32 v16, vcc, 25, v19
; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; VI-NEXT: s_brev_b32 s2, -2
@@ -16647,7 +16620,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB13_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16656,7 +16629,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB13_8
; GFX9-NEXT: .LBB13_2:
; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX9-NEXT: .LBB13_3: ; %frem.compute
+; GFX9-NEXT: .LBB13_3: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16680,10 +16653,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; GFX9-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v14, v14, v15
; GFX9-NEXT: v_add_u32_e32 v17, 26, v14
-; GFX9-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v10
@@ -16701,7 +16674,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: ; %bb.6: ; %Flow51
; GFX9-NEXT: v_mov_b32_e32 v10, v14
; GFX9-NEXT: v_mov_b32_e32 v11, v15
-; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX9-NEXT: v_subrev_u32_e32 v14, 25, v17
; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; GFX9-NEXT: s_brev_b32 s2, -2
@@ -16718,7 +16691,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB13_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16727,7 +16700,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB13_16
; GFX9-NEXT: .LBB13_10:
; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX9-NEXT: .LBB13_11: ; %frem.compute15
+; GFX9-NEXT: .LBB13_11: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16751,10 +16724,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; GFX9-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v16, v16, v17
; GFX9-NEXT: v_add_u32_e32 v19, 26, v16
-; GFX9-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB13_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v17, v13
; GFX9-NEXT: v_mov_b32_e32 v16, v12
@@ -16772,7 +16745,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v12, v16
; GFX9-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX9-NEXT: v_subrev_u32_e32 v16, 25, v19
; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; GFX9-NEXT: s_brev_b32 s2, -2
@@ -16817,7 +16790,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB13_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX10-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo
@@ -16826,7 +16799,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB13_8
; GFX10-NEXT: .LBB13_2:
; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX10-NEXT: .LBB13_3: ; %frem.compute
+; GFX10-NEXT: .LBB13_3: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -16851,10 +16824,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX10-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 26
-; GFX10-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v15, v11
; GFX10-NEXT: v_mov_b32_e32 v14, v10
@@ -16873,7 +16846,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_mov_b32_e32 v10, v14
; GFX10-NEXT: v_mov_b32_e32 v17, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v15
-; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13]
@@ -16889,7 +16862,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB13_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX10-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo
@@ -16898,7 +16871,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB13_16
; GFX10-NEXT: .LBB13_10:
; GFX10-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX10-NEXT: .LBB13_11: ; %frem.compute15
+; GFX10-NEXT: .LBB13_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -16923,10 +16896,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX10-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 26
-; GFX10-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB13_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v17, v13
; GFX10-NEXT: v_mov_b32_e32 v16, v12
@@ -16945,7 +16918,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_mov_b32_e32 v12, v16
; GFX10-NEXT: v_mov_b32_e32 v19, s2
; GFX10-NEXT: v_mov_b32_e32 v13, v17
-; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15]
@@ -16986,7 +16959,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB13_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX11-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -16996,7 +16969,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB13_8
; GFX11-NEXT: .LBB13_2:
; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX11-NEXT: .LBB13_3: ; %frem.compute
+; GFX11-NEXT: .LBB13_3: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17029,12 +17002,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 26
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17054,7 +17027,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow51
; GFX11-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX11-NEXT: v_mov_b32_e32 v11, v15
-; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17074,7 +17047,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB13_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else
; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX11-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17084,7 +17057,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB13_16
; GFX11-NEXT: .LBB13_10:
; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX11-NEXT: .LBB13_11: ; %frem.compute15
+; GFX11-NEXT: .LBB13_11: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17117,12 +17090,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 26
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB13_13: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17142,7 +17115,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow
; GFX11-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX11-NEXT: v_mov_b32_e32 v13, v17
-; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
@@ -17187,7 +17160,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1150-NEXT: s_cbranch_vccz .LBB13_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX1150-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17197,7 +17170,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB13_8
; GFX1150-NEXT: .LBB13_2:
; GFX1150-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1150-NEXT: .LBB13_3: ; %frem.compute
+; GFX1150-NEXT: .LBB13_3: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17229,12 +17202,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX1150-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s2, s2, s3
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s2, s2, 26
; GFX1150-NEXT: .p2align 6
-; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17254,7 +17227,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow51
; GFX1150-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX1150-NEXT: v_mov_b32_e32 v11, v15
-; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17274,7 +17247,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1150-NEXT: s_cbranch_vccz .LBB13_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX1150-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17284,7 +17257,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB13_16
; GFX1150-NEXT: .LBB13_10:
; GFX1150-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1150-NEXT: .LBB13_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB13_11: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17316,12 +17289,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX1150-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s2, s2, s3
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s2, s2, 26
; GFX1150-NEXT: .p2align 6
-; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17341,7 +17314,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow
; GFX1150-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX1150-NEXT: v_mov_b32_e32 v13, v17
-; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
@@ -17386,7 +17359,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1200-NEXT: s_cbranch_vccz .LBB13_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX1200-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17396,7 +17369,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB13_8
; GFX1200-NEXT: .LBB13_2:
; GFX1200-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1200-NEXT: .LBB13_3: ; %frem.compute
+; GFX1200-NEXT: .LBB13_3: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17429,11 +17402,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX1200-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_add_co_i32 s2, s2, 26
-; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17454,7 +17427,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow51
; GFX1200-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX1200-NEXT: v_mov_b32_e32 v11, v15
-; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17476,7 +17449,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cbranch_vccz .LBB13_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX1200-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX1200-NEXT: s_wait_alu 0xfffd
@@ -17487,7 +17460,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB13_16
; GFX1200-NEXT: .LBB13_10:
; GFX1200-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1200-NEXT: .LBB13_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB13_11: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17520,11 +17493,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX1200-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s2, s2, 26
-; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17547,7 +17520,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow
; GFX1200-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX1200-NEXT: v_mov_b32_e32 v13, v17
-; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3c41cc4..5babe9f 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1111,15 +1111,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1190,18 +1186,15 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1281,28 +1274,22 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v8i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.l, v1.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6
-; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[2:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v8i8:
@@ -1416,44 +1403,34 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v16i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v10.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v6.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v16i8:
@@ -1649,78 +1626,59 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v6.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v11.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v12.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v16.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v9.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v8.h
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f67ab18..234eaa8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -4985,21 +4985,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5243,18 +5239,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
-; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off
+; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off
+; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
@@ -5528,27 +5520,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5994,73 +5980,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v5.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off
-; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[9:12], off
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 866d3a1..b04602a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -12,21 +12,21 @@
; Non-R600 OSes use relocations.
; GCN-DEFAULT: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]]
-; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], private1@rel32@lo+4
-; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], private1@rel32@hi+12
+; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], .Lprivate1@rel32@lo+4
+; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], .Lprivate1@rel32@hi+12
; GCN-DEFAULT: s_getpc_b64 s[[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]]
-; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4
-; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12
+; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], .Lprivate2@rel32@lo+4
+; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], .Lprivate2@rel32@hi+12
; MESA uses absolute relocations.
-; GCN-MESA: s_add_u32 s2, private1@abs32@lo, s4
-; GCN-MESA: s_addc_u32 s3, private1@abs32@hi, s5
+; GCN-MESA: s_add_u32 s2, .Lprivate1@abs32@lo, s4
+; GCN-MESA: s_addc_u32 s3, .Lprivate1@abs32@hi, s5
; PAL uses absolute relocations.
-; GCN-PAL: s_add_u32 s2, private1@abs32@lo, s4
-; GCN-PAL: s_addc_u32 s3, private1@abs32@hi, s5
-; GCN-PAL: s_add_u32 s4, private2@abs32@lo, s4
-; GCN-PAL: s_addc_u32 s5, private2@abs32@hi, s5
+; GCN-PAL: s_add_u32 s2, .Lprivate1@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s3, .Lprivate1@abs32@hi, s5
+; GCN-PAL: s_add_u32 s4, .Lprivate2@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s5, .Lprivate2@abs32@hi, s5
; R600-LABEL: private_test
define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
index b8cfcbf..6d55e79 100644
--- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -14,8 +14,8 @@
; CHECK-LABEL: private_test:
; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8
-; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], private@rel32@hi+16
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], .Lprivate@rel32@lo+8
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], .Lprivate@rel32@hi+16
; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]]
define amdgpu_kernel void @private_test(ptr addrspace(1) %out) {
%ptr = getelementptr [256 x i32], ptr addrspace(1) @private, i32 0, i32 1
@@ -153,7 +153,7 @@ define amdgpu_kernel void @external_w_init_test(ptr addrspace(1) %out) {
ret void
}
-; CHECK: .local private
+; CHECK: .local .Lprivate
; CHECK: .local internal
; CHECK: .weak linkonce
; CHECK: .weak weak
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 049663a..f80d50b 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2730,18 +2730,15 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 6b09424..eee232a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -49,7 +49,6 @@ bb:
ret void
}
-; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 792d7db..76016e4 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -850,15 +850,13 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v2, 16, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec3..689d147 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1153 +6,1147 @@
define amdgpu_kernel void @largeInterleave() #0 { ret void }
; GCN-LABEL: largeInterleave:
; GCN: ; %bb.0:
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr0
- ; GCN-NEXT: ; implicit-def: $vgpr2
- ; GCN-NEXT: ; implicit-def: $vgpr1
- ; GCN-NEXT: ; implicit-def: $vgpr8
- ; GCN-NEXT: ; implicit-def: $vgpr94
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $vgpr106
- ; GCN-NEXT: ; implicit-def: $vgpr132
- ; GCN-NEXT: ; implicit-def: $vgpr133
- ; GCN-NEXT: ; implicit-def: $vgpr139
- ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
- ; GCN-NEXT: ; iglp_opt mask(0x00000002)
- ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr25
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN-NEXT: v_readfirstlane_b32 s17, v16
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $sgpr15
; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2
- ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
- ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1
- ; GCN-NEXT: v_add_u32_e32 v93, s0, v92
- ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: s_lshl_b32 s18, s17, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1
+ ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25
+ ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6
+ ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1
+ ; GCN-NEXT: v_add_u32_e32 v17, s15, v226
+ ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: s_lshl_b32 s0, s7, 7
- ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1
- ; GCN-NEXT: v_add_u32_e32 v8, 64, v93
- ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: v_add_u32_e32 v72, 64, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr213
+ ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
+ ; GCN-NEXT: ; implicit-def: $vgpr246
+ ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17
+ ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
+ ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
+ ; GCN-NEXT: ; implicit-def: $vgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr26
+ ; GCN-NEXT: ; implicit-def: $vgpr27
+ ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17
+ ; GCN-NEXT: v_add_u32_e32 v231, v19, v26
+ ; GCN-NEXT: v_add_u32_e32 v232, v19, v27
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; kill: killed $vgpr92
- ; GCN-NEXT: ; implicit-def: $sgpr6
+ ; GCN-NEXT: ; implicit-def: $vgpr28
+ ; GCN-NEXT: ; implicit-def: $vgpr29
+ ; GCN-NEXT: v_add_u32_e32 v233, v19, v28
+ ; GCN-NEXT: v_add_u32_e32 v234, v19, v29
+ ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: ; implicit-def: $sgpr7
+ ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
+ ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
+ ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: v_add_u32_e32 v18, s17, v20
+ ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18
+ ; GCN-NEXT: ; implicit-def: $sgpr16
+ ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr24
+ ; GCN-NEXT: v_add_u32_e32 v247, v19, v24
+ ; GCN-NEXT: v_add_u32_e32 v248, v19, v16
+ ; GCN-NEXT: v_add_u32_e32 v249, v19, v18
+ ; GCN-NEXT: v_add_u32_e32 v250, v19, v20
+ ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
+ ; GCN-NEXT: ; implicit-def: $sgpr14
+ ; GCN-NEXT: ; implicit-def: $vgpr196
+ ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13
+ ; GCN-NEXT: ; implicit-def: $vgpr211
+ ; GCN-NEXT: v_max_f32_e32 v212, v211, v211
+ ; GCN-NEXT: ; implicit-def: $vgpr198
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: ; implicit-def: $vgpr32
+ ; GCN-NEXT: ; implicit-def: $vgpr33
+ ; GCN-NEXT: ; implicit-def: $vgpr34
+ ; GCN-NEXT: v_add_u32_e32 v210, v19, v34
+ ; GCN-NEXT: v_add_u32_e32 v206, v19, v33
+ ; GCN-NEXT: v_add_u32_e32 v205, v19, v32
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ; implicit-def: $vgpr23
+ ; GCN-NEXT: ; implicit-def: $vgpr30
+ ; GCN-NEXT: ; implicit-def: $vgpr31
+ ; GCN-NEXT: v_add_u32_e32 v207, v19, v21
+ ; GCN-NEXT: v_add_u32_e32 v208, v19, v22
+ ; GCN-NEXT: v_add_u32_e32 v209, v19, v23
+ ; GCN-NEXT: v_add_u32_e32 v203, v19, v30
+ ; GCN-NEXT: v_add_u32_e32 v204, v19, v31
+ ; GCN-NEXT: ; kill: killed $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: ; implicit-def: $vgpr197
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN-NEXT: ds_write_b128 v230, v[64:67]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
+ ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[172:175], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: ds_read_b128 v[188:191], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
- ; GCN-NEXT: ; implicit-def: $vgpr64
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93
- ; GCN-NEXT: ; implicit-def: $vgpr73
- ; GCN-NEXT: v_add_u32_e32 v76, v132, v64
+ ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
+ ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr72
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v73
- ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr74
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v74
- ; GCN-NEXT: ; implicit-def: $vgpr75
- ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v72, v132, v75
- ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
+ ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7
+ ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5
+ ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; kill: killed $vgpr76
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ; implicit-def: $sgpr8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5
+ ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7
+ ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
+ ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v230, v[152:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[64:67], v94
+ ; GCN-NEXT: ds_read_b128 v[156:159], v213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512
+ ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
- ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[94:97], v106
+ ; GCN-NEXT: ds_read_b128 v[230:233], v246
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
- ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512
+ ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
+ ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
- ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5
- ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8
- ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5
- ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
- ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8
- ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8
- ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5
- ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
- ; GCN-NEXT: s_nop 5
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48
- ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49
- ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
- ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53
- ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55
- ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56
- ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: ; implicit-def: $sgpr6
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
- ; GCN-NEXT: s_nop 6
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34
- ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39
- ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
- ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
- ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
- ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
- ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20
- ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25
- ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26
- ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27
- ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14
- ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $vgpr68
- ; GCN-NEXT: ; implicit-def: $vgpr67
- ; GCN-NEXT: v_add_u32_e32 v65, s7, v65
- ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65
- ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6
- ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
- ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v135, v[94:95]
- ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v65
- ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64
+ ; GCN-NEXT: ds_write_b64 v199, v[238:239]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[98:99]
+ ; GCN-NEXT: ds_write_b64 v200, v[240:241]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[102:103]
+ ; GCN-NEXT: ds_write_b64 v201, v[242:243]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[96:97]
- ; GCN-NEXT: v_add_u32_e32 v68, v132, v68
- ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7]
- ; GCN-NEXT: v_max_f32_e32 v64, v64, v64
- ; GCN-NEXT: ; implicit-def: $vgpr65
- ; GCN-NEXT: v_max_f32_e32 v66, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v134, v66, v64
- ; GCN-NEXT: ; implicit-def: $vgpr64
+ ; GCN-NEXT: ds_write_b64 v202, v[244:245]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v64
- ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
+ ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v66
- ; GCN-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v64, v132, v67
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
- ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
- ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
- ; GCN-NEXT: v_exp_f32_e32 v163, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
- ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
- ; GCN-NEXT: v_exp_f32_e32 v164, v57
- ; GCN-NEXT: v_exp_f32_e32 v49, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
- ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
- ; GCN-NEXT: v_exp_f32_e32 v50, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
- ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134
- ; GCN-NEXT: v_exp_f32_e32 v51, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134
- ; GCN-NEXT: v_exp_f32_e32 v52, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134
- ; GCN-NEXT: v_exp_f32_e32 v53, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v54, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
- ; GCN-NEXT: v_exp_f32_e32 v55, v48
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
- ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
- ; GCN-NEXT: v_exp_f32_e32 v56, v48
- ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
- ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
- ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
- ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v48, v48
- ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
- ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
- ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
- ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
- ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
- ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
- ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
- ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
- ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
- ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
- ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
- ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
- ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
- ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
- ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
- ; GCN-NEXT: v_exp_f32_e32 v58, v58
- ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
- ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
- ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
- ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
- ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
- ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
- ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
- ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
- ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
- ; GCN-NEXT: v_exp_f32_e32 v59, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
- ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
- ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
- ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
- ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
- ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
- ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
- ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
- ; GCN-NEXT: ; implicit-def: $vgpr57
- ; GCN-NEXT: ds_read_b128 v[60:63], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v160, v149
- ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
- ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
- ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
- ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
- ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
- ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
- ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
- ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
- ; GCN-NEXT: v_exp_f32_e32 v162, v146
- ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
- ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
- ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
- ; GCN-NEXT: v_exp_f32_e32 v151, v33
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
- ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
- ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
- ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134
- ; GCN-NEXT: v_exp_f32_e32 v153, v33
- ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
- ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
- ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161
- ; GCN-NEXT: v_exp_f32_e32 v165, v60
- ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8
- ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v161, v61
- ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8
- ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5
- ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8
- ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8
+ ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
+ ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5
+ ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7
+ ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7
+ ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5
+ ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7
+ ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5
+ ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113
+ ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
+ ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126
+ ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127
+ ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
+ ; GCN-NEXT: s_nop 6
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110
+ ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111
+ ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
+ ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81
+ ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95
+ ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65
+ ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78
+ ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79
+ ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_max_f32_e32 v129, v129, v129
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v129
+ ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13]
+ ; GCN-NEXT: v_max_f32_e32 v128, v128, v128
+ ; GCN-NEXT: v_max_f32_e32 v128, v212, v128
+ ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128
+ ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128
+ ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128
+ ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113
+ ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v114, v138
+ ; GCN-NEXT: v_exp_f32_e32 v115, v139
+ ; GCN-NEXT: v_exp_f32_e32 v116, v140
+ ; GCN-NEXT: v_exp_f32_e32 v117, v141
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118
+ ; GCN-NEXT: v_exp_f32_e32 v118, v142
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120
+ ; GCN-NEXT: v_exp_f32_e32 v120, v144
+ ; GCN-NEXT: v_exp_f32_e32 v113, v112
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116
+ ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115
+ ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117
+ ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121
+ ; GCN-NEXT: v_exp_f32_e32 v112, v129
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122
+ ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v119, v143
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112
+ ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112
+ ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112
+ ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112
+ ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112
+ ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112
+ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119
+ ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120
+ ; GCN-NEXT: v_exp_f32_e32 v121, v148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v122, v149
+ ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125
+ ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128
+ ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v123, v150
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128
+ ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128
+ ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128
+ ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128
+ ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v124, v151
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122
+ ; GCN-NEXT: v_exp_f32_e32 v96, v129
+ ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v97, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124
+ ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v98, v138
+ ; GCN-NEXT: v_exp_f32_e32 v99, v127
+ ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96
+ ; GCN-NEXT: v_exp_f32_e32 v100, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
- ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5
- ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151
- ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153
- ; GCN-NEXT: v_exp_f32_e32 v159, v33
- ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38
- ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152
- ; GCN-NEXT: v_exp_f32_e32 v152, v38
+ ; GCN-NEXT: ds_write_b64 v199, v[188:189]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[60:61]
+ ; GCN-NEXT: ds_write_b64 v200, v[190:191]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[32:33]
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: ; implicit-def: $vgpr38
+ ; GCN-NEXT: ds_write_b64 v201, v[192:193]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[140:141]
- ; GCN-NEXT: v_add_u32_e32 v38, v132, v38
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN-NEXT: ds_write_b64 v202, v[194:195]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v101, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr36
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v36
- ; GCN-NEXT: ; implicit-def: $vgpr37
- ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v102, v142
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v33, v132, v37
- ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165
- ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156
- ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134
- ; GCN-NEXT: ds_read_b128 v[36:39], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v154, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158
- ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134
- ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v155, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157
- ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161
- ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159
- ; GCN-NEXT: v_exp_f32_e32 v157, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142
- ; GCN-NEXT: v_exp_f32_e32 v146, v32
- ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134
- ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
- ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v147, v36
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v143, v36
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
- ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156
- ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157
- ; GCN-NEXT: v_exp_f32_e32 v156, v32
- ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146
- ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32
- ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v129, v36
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
- ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147
- ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[36:39], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v142, v40
- ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143
- ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
- ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v63, v40
- ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61
- ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134
- ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156
- ; GCN-NEXT: v_exp_f32_e32 v158, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129
- ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v128, v17
- ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8
- ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62
- ; GCN-NEXT: v_exp_f32_e32 v167, v36
- ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8
- ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v130, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
- ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
- ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99
+ ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v103, v150
+ ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128
+ ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100
+ ; GCN-NEXT: v_exp_f32_e32 v104, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
+ ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135
+ ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v105, v125
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102
+ ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7
+ ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5
+ ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v106, v156
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103
+ ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v107, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104
+ ; GCN-NEXT: v_exp_f32_e32 v108, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v109, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107
+ ; GCN-NEXT: v_exp_f32_e32 v110, v156
+ ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v111, v146
+ ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v80, v129
+ ; GCN-NEXT: ds_read_b128 v[130:133], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
+ ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v81, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144
+ ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110
+ ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v82, v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111
+ ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v83, v135
+ ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134
+ ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[20:21]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
- ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
- ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150
- ; GCN-NEXT: v_exp_f32_e32 v140, v17
- ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[126:127]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[36:37]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
- ; GCN-NEXT: v_exp_f32_e32 v144, v22
+ ; GCN-NEXT: ds_write_b64 v200, v[150:151]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[16:17]
- ; GCN-NEXT: ; implicit-def: $vgpr17
- ; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: ds_write_b64 v201, v[152:153]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[42:43]
- ; GCN-NEXT: v_add_u32_e32 v22, v132, v22
- ; GCN-NEXT: v_add_u32_e32 v17, v132, v17
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v202, v[154:155]
+ ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v84, v129
+ ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v85, v125
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81
+ ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
+ ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134
+ ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v20, v132, v20
- ; GCN-NEXT: v_add_u32_e32 v21, v132, v21
- ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
- ; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
- ; GCN-NEXT: v_exp_f32_e32 v132, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82
+ ; GCN-NEXT: v_exp_f32_e32 v86, v156
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167
- ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134
- ; GCN-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v87, v157
+ ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138
+ ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5
+ ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7
+ ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128
+ ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5
+ ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7
+ ; GCN-NEXT: ds_read_b128 v[130:133], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84
+ ; GCN-NEXT: v_exp_f32_e32 v88, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85
+ ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5
+ ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7
+ ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5
+ ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v89, v125
+ ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86
+ ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v90, v158
+ ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87
+ ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v91, v139
+ ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
+ ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88
+ ; GCN-NEXT: v_exp_f32_e32 v92, v129
+ ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89
+ ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v93, v125
+ ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90
+ ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v94, v148
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v95, v127
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131
+ ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v125, v129
+ ; GCN-NEXT: ds_read_b128 v[132:135], v197
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v126, v142
+ ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94
+ ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95
+ ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v127, v143
+ ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v129, v138
+ ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v62, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130
- ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134
- ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134
- ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134
- ; GCN-NEXT: ; implicit-def: $sgpr0
- ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140
- ; GCN-NEXT: v_exp_f32_e32 v145, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141
- ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46
- ; GCN-NEXT: v_exp_f32_e32 v35, v16
- ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
- ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v46, v20
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v47, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34
- ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145
- ; GCN-NEXT: v_exp_f32_e32 v141, v16
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35
- ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134
- ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16
- ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v33, v20
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[20:23], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v36, v24
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47
- ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
- ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v39, v24
- ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37
- ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141
- ; GCN-NEXT: v_exp_f32_e32 v148, v1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
- ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
- ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134
- ; GCN-NEXT: v_exp_f32_e32 v34, v1
- ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8
- ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38
- ; GCN-NEXT: v_exp_f32_e32 v150, v20
- ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v38, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39
- ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134
- ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5
- ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v135, v[4:5]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
- ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5
- ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_exp_f32_e32 v42, v1
- ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5
+ ; GCN-NEXT: ds_write_b64 v199, v[150:151]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: ds_write_b64 v200, v[152:153]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125
+ ; GCN-NEXT: v_exp_f32_e32 v130, v158
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[0:1]
+ ; GCN-NEXT: ds_write_b64 v201, v[154:155]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v138, v[26:27]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: ds_write_b64 v202, v[156:157]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
- ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
- ; GCN-NEXT: v_exp_f32_e32 v25, v6
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[4:7], v139
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
- ; GCN-NEXT: v_exp_f32_e32 v26, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
- ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38
- ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
- ; GCN-NEXT: v_exp_f32_e32 v29, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41
- ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42
- ; GCN-NEXT: v_exp_f32_e32 v31, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
- ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
- ; GCN-NEXT: v_exp_f32_e32 v19, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
- ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v24, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
- ; GCN-NEXT: v_exp_f32_e32 v27, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
- ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
- ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
- ; GCN-NEXT: v_exp_f32_e32 v30, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
- ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
- ; GCN-NEXT: v_exp_f32_e32 v16, v4
- ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_exp_f32_e32 v18, v9
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_exp_f32_e32 v21, v9
- ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[4:7], v57
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
- ; GCN-NEXT: v_exp_f32_e32 v2, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8
- ; GCN-NEXT: v_exp_f32_e32 v10, v1
- ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20
- ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0
- ; GCN-NEXT: v_add_f32_e32 v3, 0, v49
- ; GCN-NEXT: v_add_f32_e32 v3, v50, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v51, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v52, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v53, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v54, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v55, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v56, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v58, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v163, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v164, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v59, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v160, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v162, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v151, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v153, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v165, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v161, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v159, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v152, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v154, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v155, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v157, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v146, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v147, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v143, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v156, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v129, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v142, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v63, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v158, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v128, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v167, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v130, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v140, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v144, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v132, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v62, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v145, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v35, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v46, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v47, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v141, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v33, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v36, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v39, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v148, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v34, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v150, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
- ; GCN-NEXT: v_add_f32_e32 v3, v38, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v42, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v25, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v26, v3
- ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
- ; GCN-NEXT: v_add_f32_e32 v3, v29, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v31, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
- ; GCN-NEXT: v_add_f32_e32 v3, v19, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v24, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v27, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v30, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v16, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v18, v3
- ; GCN-NEXT: v_add_f32_e32 v3, v21, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
- ; GCN-NEXT: v_add_f32_e32 v0, v2, v3
- ; GCN-NEXT: v_add_f32_e32 v4, v10, v0
- ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126
+ ; GCN-NEXT: v_exp_f32_e32 v131, v144
+ ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128
+ ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127
+ ; GCN-NEXT: v_exp_f32_e32 v132, v145
+ ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
+ ; GCN-NEXT: v_exp_f32_e32 v133, v141
+ ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[68:71], v198
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v72, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131
+ ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v73, v144
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132
+ ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v74, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
+ ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v75, v142
+ ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138
+ ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v76, v146
+ ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73
+ ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v77, v147
+ ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68
+ ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74
+ ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v78, v67
+ ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75
+ ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128
+ ; GCN-NEXT: v_exp_f32_e32 v79, v148
+ ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v142, v146
+ ; GCN-NEXT: ds_read_b128 v[68:71], v197
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
+ ; GCN-NEXT: v_exp_f32_e32 v137, v147
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+ ; GCN-NEXT: v_exp_f32_e32 v138, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: v_exp_f32_e32 v52, v128
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79
+ ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50
+ ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136
+ ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, 0, v113
+ ; GCN-NEXT: v_add_f32_e32 v53, v114, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v115, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v116, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v117, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v118, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v119, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v120, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v121, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v122, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v123, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v124, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v96, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v97, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v98, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v99, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v100, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v101, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v102, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v103, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v104, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v105, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v106, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v107, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v108, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v109, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v110, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v111, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v80, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v81, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v82, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v83, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v84, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v85, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v86, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v87, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v88, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v89, v53
+ ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49
+ ; GCN-NEXT: v_add_f32_e32 v53, v90, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v91, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v53, v92, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v93, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v94, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v95, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v125, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v126, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v127, v53
+ ; GCN-NEXT: v_add_f32_e32 v53, v129, v53
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_add_f32_e32 v0, v130, v53
+ ; GCN-NEXT: v_add_f32_e32 v0, v131, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v132, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v133, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v72, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v73, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v76, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v77, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v78, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v79, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v142, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v137, v0
+ ; GCN-NEXT: v_add_f32_e32 v0, v138, v0
+ ; GCN-NEXT: v_add_f32_e32 v4, v52, v0
+ ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
; GCN-NEXT: v_add_f32_e32 v2, v4, v5
- ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+ ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2
; GCN-NEXT: ; implicit-def: $vgpr4
- ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48
- ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13]
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112
+ ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee..e174fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -156,62 +156,62 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[0:3], v1
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[0:3], v3
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[128:131], v3 offset:8192
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
-; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index a2c1545..447a5f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -361,12 +361,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -444,12 +442,10 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 6f7c001..2e0e420 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -361,12 +361,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -444,12 +442,10 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index aa099b6..b65a1a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -623,62 +623,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; GCN-NEXT: ds_read_b128 a[128:131], v1
-; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; GCN-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; GCN-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; GCN-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; GCN-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; GCN-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; GCN-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; GCN-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; GCN-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; GCN-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; GCN-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; GCN-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; GCN-NEXT: ds_read_b128 a[128:131], v3
+; GCN-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; GCN-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; GCN-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; GCN-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; GCN-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; GCN-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; GCN-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; GCN-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; GCN-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; GCN-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; GCN-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; GCN-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; GCN-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; GCN-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; GCN-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; GCN-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; GCN-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; GCN-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; GCN-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; GCN-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; GCN-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; GCN-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; GCN-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; GCN-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; GCN-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; GCN-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; GCN-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(14)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; GCN-NEXT: s_waitcnt lgkmcnt(8)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; GCN-NEXT: s_nop 12
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; GCN-NEXT: s_nop 11
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
@@ -729,62 +729,62 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad
; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 2.0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v1 offset:112
-; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v1 offset:96
-; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v1 offset:80
-; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v1 offset:64
-; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v1 offset:16
-; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v1 offset:32
-; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v1 offset:48
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v2, 0x6000, v1
-; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:24576
-; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:49152
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, 1.0
-; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v2 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v2 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v2 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v2 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v2 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v2 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v2 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v2 offset:57392
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v3, s0, v0
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v3 offset:112
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v3 offset:96
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v3 offset:80
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v3 offset:64
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v3 offset:16
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v3 offset:32
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v3 offset:48
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v3 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v3 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v3 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v3 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v3 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v3 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v3 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v3 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v3 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v3 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v3 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v3 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v3 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v3 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v3 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v3 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v3 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v3 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v3 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v3 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v3 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v3 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v4 offset:57392
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v1, a[128:159]
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v1, v2, a[128:159]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v1, v2, a[96:127]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v1, a[64:95]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v1, a[96:127]
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; EXACTCUTOFF-NEXT: s_nop 12
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v1, a[32:63]
+; EXACTCUTOFF-NEXT: s_nop 11
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index 8896364..ebb33684 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -455,12 +455,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -550,12 +548,10 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index 23db247..40be567 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -455,12 +455,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
@@ -550,12 +548,10 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 92a2f54..068a989 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1053,19 +1053,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_maximum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, s0, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, s3, s2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 63e9eef..66b7958 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -315,7 +315,7 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr ad
; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, .Lhello.align4@rel32@lo+4
; SI: s_addc_u32
; SI-DAG: s_load_dwordx8
; SI-DAG: s_load_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 9e82b41..2482d10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -866,19 +866,15 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_minimum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
; GFX11-TRUE16-NEXT: v_pk_min_f16 v0, s0, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, s3, s2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
index dcf01f7..818dff4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
@@ -63,14 +63,10 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l
-; GFX12-TRUE16-NEXT: v_nop
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v1.l
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b534c2c..6f63384 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9604,11 +9604,11 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
@@ -9738,11 +9738,11 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 16
; GFX12-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v2
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 56f9c5d..d578d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -612,10 +612,10 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
- %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
- %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
- %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minnum.f32(float %x, float %y)
+ %tmp1 = call nnan float @llvm.maxnum.f32(float %x, float %y)
+ %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %z)
+ %tmp3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
store float %tmp3, ptr addrspace(1) %arg
ret void
}
@@ -646,10 +646,10 @@ define void @test_med3_minimumnum_maximumnum_f32(ptr addrspace(1) %arg, float %x
; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call float @llvm.minimumnum.f32(float %x, float %y)
- %tmp1 = call float @llvm.maximumnum.f32(float %x, float %y)
- %tmp2 = call float @llvm.minimumnum.f32(float %tmp1, float %z)
- %tmp3 = call float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
+ %tmp0 = call nnan float @llvm.minimumnum.f32(float %x, float %y)
+ %tmp1 = call nnan float @llvm.maximumnum.f32(float %x, float %y)
+ %tmp2 = call nnan float @llvm.minimumnum.f32(float %tmp1, float %z)
+ %tmp3 = call nnan float @llvm.maximumnum.f32(float %tmp0, float %tmp2)
store float %tmp3, ptr addrspace(1) %arg
ret void
}
@@ -1280,10 +1280,10 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
- %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
- %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
- %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
- %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+ %tmp0 = call nnan half @llvm.minnum.f16(half %x, half %y)
+ %tmp1 = call nnan half @llvm.maxnum.f16(half %x, half %y)
+ %tmp2 = call nnan half @llvm.minnum.f16(half %tmp1, half %z)
+ %tmp3 = call nnan half @llvm.maxnum.f16(half %tmp0, half %tmp2)
store half %tmp3, ptr addrspace(1) %arg
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll
index 5ff2d82..2509497 100644
--- a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll
@@ -5,8 +5,8 @@ declare dso_local void @main()
define dso_local void @naked() naked "frame-pointer"="all" {
; CHECK-LABEL: naked:
-; CHECK: naked$local:
-; CHECK-NEXT: .type naked$local,@function
+; CHECK: .Lnaked$local:
+; CHECK-NEXT: .type .Lnaked$local,@function
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -19,8 +19,8 @@ define dso_local void @naked() naked "frame-pointer"="all" {
define dso_local void @normal() "frame-pointer"="all" {
; CHECK-LABEL: normal:
-; CHECK: normal$local:
-; CHECK-NEXT: .type normal$local,@function
+; CHECK: .Lnormal$local:
+; CHECK-NEXT: .type .Lnormal$local,@function
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
diff --git a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
index ddbae64..a95d8c7 100644
--- a/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readsteadycounter.ll
@@ -1,8 +1,8 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck %s -check-prefixes=GCN,GFX700
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GCN,GFX900
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,GFX1100
declare i64 @llvm.readsteadycounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 9a23788..8803f3a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -367,77 +367,76 @@ bb:
define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-LABEL: illegal_mfma_after_rewrite:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[16:19]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT: v_mov_b32_e32 v5, v4
-; CHECK-NEXT: v_mov_b32_e32 v6, v4
-; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[4:7]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
-; CHECK-NEXT: global_store_short v[12:13], v17, off
+; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v9, v8
+; CHECK-NEXT: v_mov_b32_e32 v10, v8
+; CHECK-NEXT: v_mov_b32_e32 v11, v8
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v9, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
+; CHECK-NEXT: global_store_short v[0:1], v10, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
-; CHECK-NEXT: global_store_short v[12:13], v1, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CHECK-NEXT: global_store_short v[0:1], v6, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[12:13], v14, off
+; CHECK-NEXT: global_store_short v[0:1], v24, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v8, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
+; CHECK-NEXT: global_store_short v[0:1], v6, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: global_store_short v[12:13], v0, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT: global_store_short v[0:1], v2, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -546,100 +545,14 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad
define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 {
; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class:
; CHECK: ; %bb.0:
+; CHECK-NEXT: v_accvgpr_write_b32 a34, 2.0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v55, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v54, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v53, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v52, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v51, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v50, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v49, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v48, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v39, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v38, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v37, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v36, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v35, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v34, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v33, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
-; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000
-; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
-; CHECK-NEXT: v_accvgpr_read_b32 v32, a32
-; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32
-; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
-; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; CHECK-NEXT: s_nop 7
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
@@ -663,18 +576,60 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT: v_accvgpr_write_b32 a33, 4.0
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35]
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37]
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39]
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], v[40:41]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], v[42:43]
+; CHECK-NEXT: v_mov_b64_e32 v[14:15], v[44:45]
+; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[46:47]
+; CHECK-NEXT: v_mov_b64_e32 v[18:19], v[48:49]
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], v[50:51]
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], v[52:53]
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61]
+; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; CHECK-NEXT: s_nop 15
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[2:3] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[2:3] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[2:3] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
; CHECK-NEXT: s_endpgm
%src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
%mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: buffer_load_lds_not_valu
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+ ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; CHECK-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+ %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ $m0 = S_MOV_B32 1
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
new file mode 100644
index 0000000..33b2f69
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-pending-queue.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler --misched-prera-direction=topdown -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that cycle counts are consistent with hazards.
+
+# CHECK: Cycle: 3 TopQ.A
+# CHECK: hazard: SU(6) HWXDL[0]=9c, is later than CurrCycle = 3c
+# CHECK-NOT: Cycle: 9 TopQ.A
+# CHECK: Cycle: 83 TopQ.A
+# CHECK: Checking pending node SU(6)
+# CHECK: Move SU(6) into Available Q
+
+---
+name: pending_queue_ready_cycle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %2:sgpr_128 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %18:areg_512 = IMPLICIT_DEF
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, implicit $exec
+ %18:areg_512 = V_MFMA_F32_16X16X1F32_mac_e64 %15, %14, %18, 0, 0, 0, implicit $mode, implicit $exec
+ undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %5.sub0, %14, implicit $exec
+ %7:vreg_512 = COPY %18
+ SCHED_BARRIER 0
+ S_NOP 0, implicit %18, implicit %7, implicit %84
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 5c0f813..441509b 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -391,156 +391,144 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: sdiv_v2i32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s10, s2
-; GCN-NEXT: s_mov_b32 s11, s3
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s6
-; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s0, v2
-; GCN-NEXT: s_abs_i32 s1, s0
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GCN-NEXT: s_sub_i32 s6, 0, s1
-; GCN-NEXT: v_readfirstlane_b32 s8, v3
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v4, s6, v2
-; GCN-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NEXT: s_abs_i32 s7, s6
-; GCN-NEXT: s_xor_b32 s0, s6, s0
-; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
-; GCN-NEXT: s_ashr_i32 s6, s0, 31
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s0, s0, s1
-; GCN-NEXT: s_sub_i32 s0, s7, s0
-; GCN-NEXT: s_sub_i32 s7, s0, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: s_cselect_b32 s0, s7, s0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_abs_i32 s7, s8
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_sub_i32 s4, 0, s7
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s6, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
-; GCN-NEXT: v_readfirstlane_b32 s4, v1
-; GCN-NEXT: s_xor_b32 s5, s4, s8
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
-; GCN-NEXT: s_ashr_i32 s5, s5, 31
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_mul_i32 s6, s6, s7
-; GCN-NEXT: s_sub_i32 s4, s4, s6
-; GCN-NEXT: s_sub_i32 s6, s4, s7
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: s_cselect_b32 s4, s6, s4
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, s5, v1
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v7, v1, v3
+; GCN-NEXT: v_max_i32_e32 v2, v2, v6
+; GCN-NEXT: v_max_i32_e32 v3, v3, v9
+; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
+; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GCN-NEXT: v_max_i32_e32 v0, v0, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v6
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v9
+; GCN-NEXT: v_max_i32_e32 v1, v1, v8
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v10
+; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT: v_mul_lo_u32 v8, v6, v2
+; GCN-NEXT: v_mul_lo_u32 v10, v5, v3
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s3, 0xf000
-; TONGA-NEXT: s_mov_b32 s2, -1
-; TONGA-NEXT: s_mov_b32 s10, s2
-; TONGA-NEXT: s_mov_b32 s11, s3
+; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s6
-; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v2
-; TONGA-NEXT: s_abs_i32 s1, s0
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1
-; TONGA-NEXT: s_sub_i32 s6, 0, s1
-; TONGA-NEXT: v_readfirstlane_b32 s8, v3
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2
-; TONGA-NEXT: v_readfirstlane_b32 s6, v0
-; TONGA-NEXT: s_abs_i32 s7, s6
-; TONGA-NEXT: s_xor_b32 s0, s6, s0
-; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
-; TONGA-NEXT: s_ashr_i32 s6, s0, 31
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4
-; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: s_mul_i32 s0, s0, s1
-; TONGA-NEXT: s_sub_i32 s0, s7, s0
-; TONGA-NEXT: s_sub_i32 s7, s0, s1
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s0, s7, s0
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: s_abs_i32 s7, s8
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7
-; TONGA-NEXT: s_mov_b32 s0, s4
-; TONGA-NEXT: s_sub_i32 s4, 0, s7
-; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3
-; TONGA-NEXT: v_readfirstlane_b32 s4, v1
-; TONGA-NEXT: s_xor_b32 s5, s4, s8
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4
-; TONGA-NEXT: s_ashr_i32 s5, s5, 31
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_readfirstlane_b32 s6, v1
-; TONGA-NEXT: s_mul_i32 s6, s6, s7
-; TONGA-NEXT: s_sub_i32 s4, s4, s6
-; TONGA-NEXT: s_sub_i32 s6, s4, s7
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s4, s6, s4
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
+; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2
+; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v3, v3, v9
+; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2
+; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6
+; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3
+; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1
+; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v8
+; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10
+; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8
+; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6
+; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2
+; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32:
@@ -558,44 +546,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: s_abs_i32 s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_xor_b32 s0, s4, s0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_xor_b32 s0, s5, s0
; GFX9-NEXT: s_ashr_i32 s6, s0, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: s_sub_i32 s0, 0, s1
-; GFX9-NEXT: s_abs_i32 s4, s4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s0, s0, s7
; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0
; GFX9-NEXT: s_add_i32 s7, s7, s0
-; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7
; GFX9-NEXT: s_mul_i32 s7, s0, s1
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
+; GFX9-NEXT: s_sub_i32 s5, s5, s7
; GFX9-NEXT: s_add_i32 s10, s0, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
+; GFX9-NEXT: s_sub_i32 s7, s5, s1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
; GFX9-NEXT: s_cselect_b32 s0, s10, s0
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
; GFX9-NEXT: s_add_i32 s7, s0, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s4, s7, s0
-; GFX9-NEXT: s_abs_i32 s7, s5
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
+; GFX9-NEXT: s_cselect_b32 s5, s7, s0
+; GFX9-NEXT: s_abs_i32 s7, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_xor_b32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s8, s5
+; GFX9-NEXT: s_xor_b32 s4, s8, s4
; GFX9-NEXT: s_abs_i32 s8, s8
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s9, s9, s6
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9
@@ -611,10 +599,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_add_i32 s9, s6, 1
; GFX9-NEXT: s_cmp_ge_u32 s8, s7
; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s5
-; GFX9-NEXT: s_sub_i32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b32 s6, s6, s4
+; GFX9-NEXT: s_sub_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -804,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: sdiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s10, -1
-; GCN-NEXT: s_mov_b32 s6, s10
-; GCN-NEXT: s_mov_b32 s7, s11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s2
-; GCN-NEXT: s_mov_b32 s5, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; GCN-NEXT: s_mov_b32 s8, s0
-; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-NEXT: s_abs_i32 s13, s0
-; GCN-NEXT: s_abs_i32 s14, s1
-; GCN-NEXT: s_abs_i32 s15, s2
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GCN-NEXT: v_xor_b32_e32 v8, v0, v4
+; GCN-NEXT: v_max_i32_e32 v4, v4, v10
+; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
+; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT: v_max_i32_e32 v5, v5, v13
+; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13
+; GCN-NEXT: v_max_i32_e32 v0, v0, v9
+; GCN-NEXT: v_mul_hi_u32 v16, v10, v16
+; GCN-NEXT: v_max_i32_e32 v1, v1, v12
+; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v13
+; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
+; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT: v_max_i32_e32 v6, v6, v15
+; GCN-NEXT: v_mul_hi_u32 v12, v13, v16
+; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6
+; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GCN-NEXT: v_mul_lo_u32 v13, v10, v4
+; GCN-NEXT: v_mul_hi_u32 v12, v1, v12
+; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_mul_lo_u32 v0, v12, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v4, v9
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7
+; GCN-NEXT: v_max_i32_e32 v5, v7, v0
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v4
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s17, s6
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_max_i32_e32 v2, v2, v9
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s3, v4
-; GCN-NEXT: v_readfirstlane_b32 s4, v5
-; GCN-NEXT: v_readfirstlane_b32 s5, v6
-; GCN-NEXT: s_xor_b32 s12, s3, s0
-; GCN-NEXT: s_xor_b32 s0, s4, s1
-; GCN-NEXT: s_xor_b32 s1, s5, s2
-; GCN-NEXT: s_sub_i32 s2, 0, s13
-; GCN-NEXT: s_ashr_i32 s18, s0, 31
-; GCN-NEXT: s_sub_i32 s0, 0, s14
-; GCN-NEXT: s_ashr_i32 s19, s1, 31
-; GCN-NEXT: s_sub_i32 s1, 0, s15
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s0, v1
-; GCN-NEXT: v_mul_lo_u32 v6, s1, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT: s_sub_i32 s20, 0, s17
-; GCN-NEXT: v_readfirstlane_b32 s7, v7
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: s_abs_i32 s5, s5
-; GCN-NEXT: v_mul_lo_u32 v7, s20, v3
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s5, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
-; GCN-NEXT: v_mul_lo_u32 v4, v0, s13
-; GCN-NEXT: v_mul_lo_u32 v6, v1, s14
-; GCN-NEXT: v_mul_lo_u32 v8, v2, s15
-; GCN-NEXT: s_abs_i32 s16, s7
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v3, s16, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, s17
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; GCN-NEXT: s_ashr_i32 s12, s12, 31
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
-; GCN-NEXT: v_xor_b32_e32 v1, s18, v1
-; GCN-NEXT: v_xor_b32_e32 v2, s19, v2
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: s_xor_b32 s0, s7, s6
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: s_ashr_i32 s0, s0, 31
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, s0, v3
-; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v6
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v10
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GCN-NEXT: v_max_i32_e32 v6, v3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v6, v4
+; GCN-NEXT: v_xor_b32_e32 v2, v2, v14
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v5
+; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_xor_b32_e32 v4, v4, v3
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s11, 0xf000
-; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: v_readfirstlane_b32 s1, v1
-; TONGA-NEXT: v_readfirstlane_b32 s2, v2
-; TONGA-NEXT: s_abs_i32 s13, s0
-; TONGA-NEXT: s_abs_i32 s14, s1
-; TONGA-NEXT: s_abs_i32 s15, s2
-; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13
-; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15
-; TONGA-NEXT: v_readfirstlane_b32 s6, v3
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
+; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4
+; TONGA-NEXT: v_max_i32_e32 v4, v4, v10
+; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5
+; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT: v_max_i32_e32 v5, v5, v13
+; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4
+; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10
+; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v9
+; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v12
+; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6
+; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13
+; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10
+; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v6, v6, v15
+; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16
+; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6
+; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12
+; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4
+; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5
+; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5
+; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7
+; TONGA-NEXT: v_max_i32_e32 v5, v7, v0
+; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: s_abs_i32 s17, s6
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v9
+; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s3, v4
-; TONGA-NEXT: v_readfirstlane_b32 s4, v5
-; TONGA-NEXT: v_readfirstlane_b32 s5, v6
-; TONGA-NEXT: s_xor_b32 s12, s3, s0
-; TONGA-NEXT: s_xor_b32 s0, s4, s1
-; TONGA-NEXT: s_xor_b32 s1, s5, s2
-; TONGA-NEXT: s_sub_i32 s2, 0, s13
-; TONGA-NEXT: s_ashr_i32 s18, s0, 31
-; TONGA-NEXT: s_sub_i32 s0, 0, s14
-; TONGA-NEXT: s_ashr_i32 s19, s1, 31
-; TONGA-NEXT: s_sub_i32 s1, 0, s15
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0
-; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1
-; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
-; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6
-; TONGA-NEXT: s_sub_i32 s20, 0, s17
-; TONGA-NEXT: v_readfirstlane_b32 s7, v7
-; TONGA-NEXT: s_abs_i32 s3, s3
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: s_abs_i32 s5, s5
-; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2
-; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7
-; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13
-; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14
-; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15
-; TONGA-NEXT: s_abs_i32 s16, s7
-; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8
-; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0
-; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1
-; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; TONGA-NEXT: s_ashr_i32 s12, s12, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0
-; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1
-; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1
-; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: s_xor_b32 s0, s7, s6
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: s_ashr_i32 s0, s0, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3
-; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3
+; TONGA-NEXT: v_max_i32_e32 v6, v3, v6
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4
+; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5
+; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3
+; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
@@ -2006,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2014,7 +2002,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -2053,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2061,7 +2049,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index bbdfc76..da454ee 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -852,19 +852,19 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
-; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
-; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
+; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0
@@ -874,20 +874,18 @@ define amdgpu_kernel void @select_v2f16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v7.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1058,21 +1056,18 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1236,21 +1231,18 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1402,8 +1394,6 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
@@ -1411,10 +1401,10 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
-; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1425,12 +1415,9 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_c:
@@ -1581,8 +1568,6 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
-; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
@@ -1590,10 +1575,10 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
-; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
-; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[12:15], 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
@@ -1604,12 +1589,9 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3900, v0.l, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: select_v2f16_imm_d:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
index 9553fcc..f11fe4a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
@@ -59,6 +59,15 @@ body: |
...
---
+name: src_shared_base_to_vcc
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: src_shared_base_to_vcc
+ ; GFX9: $vcc = S_MOV_B64 $src_shared_base
+ $vcc = COPY $src_shared_base
+...
+
+---
name: sgpr96_aligned_src_dst
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 5944342..bbd1793 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -467,28 +467,28 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: s_abs_i32 s2, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_sub_i32 s6, 0, s2
-; GCN-NEXT: s_ashr_i32 s5, s3, 31
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NEXT: s_abs_i32 s4, s4
+; GCN-NEXT: v_readfirstlane_b32 s3, v3
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_readfirstlane_b32 s7, v0
; GCN-NEXT: s_mul_i32 s6, s6, s7
; GCN-NEXT: s_mul_hi_u32 s6, s7, s6
; GCN-NEXT: s_add_i32 s7, s7, s6
-; GCN-NEXT: s_mul_hi_u32 s6, s3, s7
+; GCN-NEXT: s_mul_hi_u32 s6, s4, s7
; GCN-NEXT: s_mul_i32 s6, s6, s2
-; GCN-NEXT: s_sub_i32 s3, s3, s6
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s3, s6, s3
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s2, s6, s3
-; GCN-NEXT: s_abs_i32 s3, s4
+; GCN-NEXT: s_sub_i32 s4, s4, s6
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s4, s6, s4
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s2, s6, s4
+; GCN-NEXT: s_abs_i32 s3, s3
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_xor_b32 s2, s2, s5
; GCN-NEXT: s_sub_i32 s7, 0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 4a6202ea..6daea57 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -788,12 +788,10 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3
; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s3, 16
; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s2, 16
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v1.l, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.h, s1, s0
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-SDAG-FAKE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index cd1c532..6a273e5 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -813,7 +813,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -825,11 +825,9 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index c9b94e0..99b6ab7 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -189,14 +189,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
-; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SDAG-GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s2, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.h, s3, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; SDAG-GFX11-TRUE16-NEXT: s_endpgm
;
; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_sgpr:
@@ -215,14 +212,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX12-TRUE16: ; %bb.0:
; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
-; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX12-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s2, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.h, s3, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; SDAG-GFX12-TRUE16-NEXT: s_endpgm
;
; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 801324e..dfc59f6 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -1023,10 +1023,11 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
@@ -1052,10 +1053,11 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 98919f5..4d5ade4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -1024,10 +1024,11 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
@@ -1053,10 +1054,11 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
@@ -1298,11 +1300,12 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
@@ -1331,11 +1334,12 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
@@ -1468,12 +1472,13 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
@@ -1509,12 +1514,13 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
@@ -1706,12 +1712,13 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
@@ -1762,12 +1769,13 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index bdb1c22..9e033f5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -1046,10 +1046,11 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
@@ -1075,10 +1076,11 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
index cf344ea..166e6c4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
@@ -992,10 +992,11 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
@@ -1021,10 +1022,11 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index 07e9325..5045540 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -455,10 +455,7 @@ define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v2i16_rebroadcast:
@@ -499,10 +496,8 @@ define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -550,10 +545,8 @@ define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@@ -613,10 +606,8 @@ define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
@@ -700,10 +691,8 @@ define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b01e92d..6bf6d54 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1288,9 +1288,8 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2571,10 +2570,9 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: i16_hi16low16bits:
@@ -2626,14 +2624,10 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: i16_hi16bits:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: i16_hi16bits:
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
index c8fee5d..7cbe5de 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -119,9 +119,10 @@ body: |
; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
%2(s16) = G_CTLZ %1
- ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
- ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
- ; CHECK: $r0 = COPY [[R]]
+ ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
+ ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+ ; LIBCALLS: $r0 = COPY [[R]]
+ ; CLZ: $r0 = COPY [[R32]]
%3(s32) = G_SEXT %2(s16)
$r0 = COPY %3(s32)
BX_RET 14, $noreg, implicit $r0
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index a2d6ca9..972a470 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%progbits,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index bf5249e..ec8d5b8 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -1,8 +1,8 @@
;; Test if temporary labels are generated for each indirect callsite.
-;; Test if the .callgraph section contains the MD5 hash of callees' type (type id)
+;; Test if the .llvm.callgraph section contains the MD5 hash of callees' type (type id)
;; is correctly paired with its corresponding temporary label generated for indirect
;; call sites annotated with !callee_type metadata.
-;; Test if the .callgraph section contains unique direct callees.
+;; Test if the .llvm.callgraph section contains unique direct callees.
; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .callgraph,"o",%progbits,.text
+; CHECK: .section .llvm.callgraph,"o",%progbits,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
index d577603..8036004 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
@@ -1,7 +1,7 @@
-;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls.
+;; Tests that we store the type identifiers in .llvm.callgraph section of the object file for tailcalls.
; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \
-; RUN: llvm-readelf -x .callgraph - | FileCheck %s
+; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s
define i32 @check_tailcall(ptr %func, i8 %x) !type !0 {
entry:
@@ -27,7 +27,7 @@ declare !type !2 i32 @bar(i8 signext)
!2 = !{i64 0, !"_ZTSFicE.generalized"}
!3 = !{i64 0, !"_ZTSFiiE.generalized"}
-; CHECK: Hex dump of section '.callgraph':
+; CHECK: Hex dump of section '.llvm.callgraph':
; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154
; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8
;; Verify that the type id 0x308e4b8159bc8654 is in section.
diff --git a/llvm/test/CodeGen/ARM/call-graph-section.ll b/llvm/test/CodeGen/ARM/call-graph-section.ll
index 928a1067..167cc6f 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section.ll
@@ -1,7 +1,7 @@
-;; Tests that we store the type identifiers in .callgraph section of the object file.
+;; Tests that we store the type identifiers in .llvm.callgraph section of the object file.
; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \
-; RUN: llvm-readelf -x .callgraph - | FileCheck %s
+; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s
declare !type !0 void @foo()
@@ -31,7 +31,7 @@ entry:
;; Make sure following type IDs are in call graph section
;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814
-; CHECK: Hex dump of section '.callgraph':
+; CHECK: Hex dump of section '.llvm.callgraph':
; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000324
; CHECK-NEXT: 0x00000010 44f731f5 eecb3e54 86bc5981 4b8e307a
; CHECK-NEXT: 0x00000020 de6814f8 97fd77
diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll
index 558e2b0..a652241 100644
--- a/llvm/test/CodeGen/ARM/carry.ll
+++ b/llvm/test/CodeGen/ARM/carry.ll
@@ -1,61 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
-; CHECK: subs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: lsl r
-; CHECK: orr r
-; CHECK: rsbs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsl r1, r1, #1
+; CHECK-NEXT: orr r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; add with live carry
define i64 @f3(i32 %al, i32 %bl) {
; CHECK-LABEL: f3:
-; CHECK: adds r
-; CHECK: adc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: adcs r0, r1, #0
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: bx lr
entry:
- ; unsigned wide add
- %aw = zext i32 %al to i64
- %bw = zext i32 %bl to i64
- %cw = add i64 %aw, %bw
- ; ch == carry bit
- %ch = lshr i64 %cw, 32
- %dw = add i64 %ch, %bw
- ret i64 %dw
+ ; unsigned wide add
+ %aw = zext i32 %al to i64
+ %bw = zext i32 %bl to i64
+ %cw = add i64 %aw, %bw
+ ; ch == carry bit
+ %ch = lshr i64 %cw, 32
+ %dw = add i64 %ch, %bw
+ ret i64 %dw
}
; rdar://10073745
define i64 @f4(i64 %x) nounwind readnone {
-entry:
; CHECK-LABEL: f4:
-; CHECK: rsbs r
-; CHECK: rsc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: rsbs r0, r0, #0
+; CHECK-NEXT: rsc r1, r1, #0
+; CHECK-NEXT: bx lr
+entry:
%0 = sub nsw i64 0, %x
ret i64 %0
}
; rdar://12559385
define i64 @f5(i32 %vi) {
-entry:
; CHECK-LABEL: f5:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eor r0, r0, r1
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: sbc r2, r2, r1
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adc r1, r2, r1
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/ARM/nnan-fsub.ll b/llvm/test/CodeGen/ARM/nnan-fsub.ll
index 0183908..78dd36f 100644
--- a/llvm/test/CodeGen/ARM/nnan-fsub.ll
+++ b/llvm/test/CodeGen/ARM/nnan-fsub.ll
@@ -1,18 +1,22 @@
-; RUN: llc -mcpu=cortex-a9 < %s | FileCheck -check-prefix=SAFE %s
-; RUN: llc -mcpu=cortex-a9 --enable-no-nans-fp-math < %s | FileCheck -check-prefix=FAST %s
+; RUN: llc -mcpu=cortex-a9 < %s | FileCheck %s
target triple = "armv7-apple-ios"
-; SAFE: test
-; FAST: test
+; CHECK-LABEL: test
define float @test(float %x, float %y) {
entry:
-; SAFE: vmul.f32
-; SAFE: vsub.f32
-; FAST: mov r0, #0
+; CHECK: vmul.f32
+; CHECK-NEXT: vsub.f32
%0 = fmul float %x, %y
%1 = fsub float %0, %0
ret float %1
}
-
+; CHECK-LABEL: test_nnan
+define float @test_nnan(float %x, float %y) {
+entry:
+; CHECK: mov r0, #0
+ %0 = fmul float %x, %y
+ %1 = fsub nnan float %0, %0
+ ret float %1
+}
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
index a78fdd5..f1486f97 100644
--- a/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/memcpy.ll
@@ -74,7 +74,7 @@ entry:
; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1
; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2
; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY:%.*]], i32 0
-; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32
+; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 8
; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 5)
; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0
; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1
@@ -83,9 +83,9 @@ entry:
; CHECK: [[UPTO0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0
; CHECK: [[UPTO1:%.*]] = insertelement <3 x double> [[UPTO0]], double [[Y]], i32 1
; CHECK: [[UPTO2:%.*]] = insertelement <3 x double> [[UPTO1]], double [[Z]], i32 2
-; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 32
-; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 32
- call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 64, i1 false)
+; CHECK: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[A2_COPY]], i32 24
+; CHECK: store <3 x double> [[UPTO2]], ptr [[DEST]], align 8
+ call void @llvm.memcpy.p0.p2.i32(ptr align 32 %a2.copy, ptr addrspace(2) align 32 @a2, i32 48, i1 false)
; CHECK: [[CB:%.*]] = load target("dx.CBuffer", {{.*}})), ptr @CB.cb, align 4
; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", {{.*}})) [[CB]], i32 7)
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
index 71dcf11..196560f 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy-errors.ll
@@ -11,11 +11,11 @@ declare void @f16_user(half)
; CHECK-SAME: in function four64
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @four64() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {double}, 8, 0))
+ %buffer = call target("dx.CBuffer", <{ double }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {double, double, double, double} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {double}, 8, 0)) %buffer,
+ target("dx.CBuffer", <{ double }>) %buffer,
i32 0)
%data = extractvalue {double, double, double, double} %load, 0
@@ -28,11 +28,11 @@ define void @four64() "hlsl.export" {
; CHECK-SAME: in function two32
; CHECK-SAME: Type mismatch between intrinsic and DXIL op
define void @two32() "hlsl.export" {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
%load = call {float, float} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float} %load, 0
@@ -41,5 +41,5 @@ define void @two32() "hlsl.export" {
ret void
}
-declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_tdx.Layout_sl_f64s_8_0tt(target("dx.CBuffer", target("dx.Layout", { double }, 8, 0)), i32)
-declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_tdx.Layout_sl_f32s_4_0tt(target("dx.CBuffer", target("dx.Layout", { float }, 4, 0)), i32)
+declare { double, double, double, double } @llvm.dx.resource.load.cbufferrow.4.f64.f64.f64.f64.tdx.CBuffer_sl_f64st(target("dx.CBuffer", <{ double }>), i32)
+declare { float, float } @llvm.dx.resource.load.cbufferrow.2.f32.f32.tdx.CBuffer_sl_f32st(target("dx.CBuffer", <{ float }>), i32)
diff --git a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
index d690651..dd40aa8 100644
--- a/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferLoadLegacy.ll
@@ -8,12 +8,12 @@ declare void @f16_user(half)
; CHECK-LABEL: define void @loadf32
define void @loadf32() {
- %buffer = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %buffer = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {float, float, float, float} @llvm.dx.resource.load.cbufferrow.4(
- target("dx.CBuffer", target("dx.Layout", {float}, 4, 0)) %buffer,
+ target("dx.CBuffer", <{ float }>) %buffer,
i32 0)
%data = extractvalue {float, float, float, float} %load, 0
@@ -27,12 +27,12 @@ define void @loadf32() {
; CHECK-LABEL: define void @loadf64
define void @loadf64() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24))
+ target("dx.CBuffer", <{ <4 x double> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f64 @dx.op.cbufferLoadLegacy.f64(i32 59, %dx.types.Handle %{{.*}}, i32 1)
%load = call {double, double} @llvm.dx.resource.load.cbufferrow.2(
- target("dx.CBuffer", target("dx.Layout", {double, double, double, double}, 64, 0, 8, 16, 24)) %buffer,
+ target("dx.CBuffer", <{ <4 x double> }>) %buffer,
i32 1)
%data = extractvalue {double, double} %load, 1
@@ -46,12 +46,12 @@ define void @loadf64() {
; CHECK-LABEL: define void @loadf16
define void @loadf16() {
%buffer = call
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0))
+ target("dx.CBuffer", <{ half }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[DATA:%.*]] = call %dx.types.CBufRet.f16.8 @dx.op.cbufferLoadLegacy.f16(i32 59, %dx.types.Handle %{{.*}}, i32 0)
%load = call {half, half, half, half, half, half, half, half} @llvm.dx.resource.load.cbufferrow.8(
- target("dx.CBuffer", target("dx.Layout", {half}, 2, 0)) %buffer,
+ target("dx.CBuffer", <{ half }>) %buffer,
i32 0)
%data = extractvalue {half, half, half, half, half, half, half, half} %load, 0
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
index bcf82a6..5cd67be 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources-order.ll
@@ -18,7 +18,7 @@ define void @main() #0 {
%srv0 = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(
i32 1, i32 8, i32 1, i32 0, ptr null)
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
index bea0310..d792078 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
@@ -14,7 +14,7 @@ define void @main() #0 {
; CHECK: Kind: CBuffer
; CHECK: Flags:
; CHECK: UsedByAtomic64: false
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 3, i32 2, i32 1, i32 0, ptr null)
; ByteAddressBuffer Buf : register(t8, space1)
@@ -94,6 +94,18 @@ define void @main() #0 {
%uav2_2 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0(
i32 4, i32 0, i32 10, i32 5, ptr null)
+
+ ; RWBuffer<float4> UnboundedArray[] : register(u10, space5)
+; CHECK: - Type: UAVTyped
+; CHECK: Space: 5
+; CHECK: LowerBound: 10
+; CHECK: UpperBound: 4294967295
+; CHECK: Kind: TypedBuffer
+; CHECK: Flags:
+; CHECK: UsedByAtomic64: false
+ ; RWBuffer<float4> Buf = BufferArray[100];
+ %uav3 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
+ @llvm.dx.resource.handlefrombinding(i32 5, i32 10, i32 -1, i32 100, ptr null)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
index 38f2de2..671fcef 100644
--- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
+++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
@@ -72,7 +72,7 @@ define void @test_bindings() {
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) #[[#ATTR]]
; cbuffer cb0 : register(b0) { int4 i; float4 f; }
- %cb0 = call target("dx.CBuffer", target("dx.Layout", {<4 x i32>, <4 x float>}, 32, 0, 16))
+ %cb0 = call target("dx.CBuffer", <{ <4 x i32>, <4 x float> }>)
@llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK: [[BUF6:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false) #[[#ATTR]]
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF6]], %dx.types.ResourceProperties { i32 13, i32 32 }) #[[#ATTR]]
diff --git a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
index 26b157f..d674863 100644
--- a/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
+++ b/llvm/test/CodeGen/DirectX/ForwardHandleAccesses/cbuffer-access.ll
@@ -4,27 +4,27 @@
%__cblayout_CB2 = type <{ float }>
%struct.Scalars = type { float, i32, i32 }
-@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) poison
-@CB2.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) poison
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+@CB2.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB2) poison
define void @main() local_unnamed_addr #1 {
entry:
; CHECK: [[CB:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefrombinding
- %h = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %h, ptr @CB.cb, align 4
+ %h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB) %h, ptr @CB.cb, align 4
%_ZL3Out_h.i.i = tail call target("dx.RawBuffer", %struct.Scalars, 1, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
; CHECK-NOT: load target({{.*}}), ptr @CB.cb
- %cb = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)), ptr @CB.cb, align 4
+ %cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
; CHECK: call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target({{.*}}) [[CB]], i32 0)
- %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 12, 0, 4, 8)) %cb, i32 0)
+ %0 = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4(target("dx.CBuffer", %__cblayout_CB) %cb, i32 0)
%1 = extractvalue { float, float, float, float } %0, 0
call void @llvm.dx.resource.store.rawbuffer(target("dx.RawBuffer", %struct.Scalars, 1, 0) %_ZL3Out_h.i.i, i32 0, i32 0, float %1)
-
+
; CHECK: [[CB2:%.*]] = tail call target({{.*}}) @llvm.dx.resource.handlefromimplicitbinding
- %h2 = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
- store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)) %h2, ptr @CB2.cb, align 4
+ %h2 = tail call target("dx.CBuffer", %__cblayout_CB2) @llvm.dx.resource.handlefromimplicitbinding(i32 100, i32 0, i32 1, i32 0, ptr null)
+ store target("dx.CBuffer", %__cblayout_CB2) %h2, ptr @CB2.cb, align 4
; CHECK-NOT: load target({{.*}}), ptr @CB2.cb
- %cb2 = load target("dx.CBuffer", target("dx.Layout", %__cblayout_CB2, 4, 0)), ptr @CB2.cb, align 4
+ %cb2 = load target("dx.CBuffer", %__cblayout_CB2), ptr @CB2.cb, align 4
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
index 7ba2ed2..85952c9 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer_metadata.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-layouttype.ll
@@ -1,3 +1,6 @@
+; TODO: Remove this test once we've updated the frontend to use explicit
+; padding. The cbuffer-metadata.ll test covers the newer logic.
+
; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
@@ -19,11 +22,11 @@ target triple = "dxil-pc-shadermodel6.6-compute"
; PRINT:; Resource Bindings:
; PRINT-NEXT:;
-; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count
-; PRINT-NEXT:; ------------------------------ ---------- ------- ----------- ------- -------------- ------
-; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1
-; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1
-; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1
+; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count
+; PRINT-NEXT:; ----
+; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1
+; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1
+; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1
define void @test() #0 {
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
new file mode 100644
index 0000000..6b90e17
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-metadata.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -dxil-translate-metadata < %s | FileCheck %s
+; RUN: opt -S --passes="dxil-pretty-printer" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+%__cblayout_CB1 = type <{ float, i32, double, <2 x i32> }>
+@CB1.cb = global target("dx.CBuffer", %__cblayout_CB1) poison
+@CB1.str = private unnamed_addr constant [4 x i8] c"CB1\00", align 1
+
+%__cblayout_CB2 = type <{ float, target("dx.Padding", 4), double, float, half, i16, i64, i32 }>
+@CB2.cb = global target("dx.CBuffer", %__cblayout_CB2) poison
+@CB2.str = private unnamed_addr constant [4 x i8] c"CB2\00", align 1
+
+%__cblayout_MyConstants = type <{
+ double, target("dx.Padding", 8),
+ <3 x float>, float,
+ <3 x double>, half, target("dx.Padding", 6),
+ <2 x double>,
+ float, <3 x half>, <3 x half>
+}>
+@MyConstants.cb = global target("dx.CBuffer", %__cblayout_MyConstants) poison
+@MyConstants.str = private unnamed_addr constant [12 x i8] c"MyConstants\00", align 1
+
+; PRINT:; Resource Bindings:
+; PRINT-NEXT:;
+; PRINT-NEXT:; Name Type Format Dim ID HLSL Bind Count
+; PRINT-NEXT:; ----
+; PRINT-NEXT:; CB1 cbuffer NA NA CB0 cb0 1
+; PRINT-NEXT:; CB2 cbuffer NA NA CB1 cb1 1
+; PRINT-NEXT:; MyConstants cbuffer NA NA CB2 cb5,space15 1
+
+define void @test() #0 {
+
+ ; cbuffer CB1 : register(b0) {
+ ; float a;
+ ; int b;
+ ; double c;
+ ; int2 d;
+ ; }
+ %CB1.cb_h = call target("dx.CBuffer", %__cblayout_CB1)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr @CB1.str)
+
+ ; cbuffer CB2 : register(b0) {
+ ; float a;
+ ; double b;
+ ; float c;
+ ; half d;
+ ; uint16_t e;
+ ; int64_t f;
+ ; int g;
+ ;}
+ %CB2.cb_h = call target("dx.CBuffer", %__cblayout_CB2)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, ptr @CB2.str)
+
+ ; cbuffer CB3 : register(b5) {
+ ; double B0;
+ ; float3 B1;
+ ; float B2;
+ ; double3 B3;
+ ; half B4;
+ ; double2 B5;
+ ; float B6;
+ ; half3 B7;
+ ; half3 B8;
+ ; }
+ %CB3.cb_h = call target("dx.CBuffer", %__cblayout_MyConstants)
+ @llvm.dx.resource.handlefrombinding(i32 15, i32 5, i32 1, i32 0, ptr @MyConstants.str)
+
+ ret void
+}
+
+attributes #0 = { noinline nounwind "hlsl.shader"="compute" }
+
+; CHECK: %CBuffer.CB1 = type { { float, i32, double, <2 x i32> } }
+; CHECK: %CBuffer.CB2 = type { { float, double, float, half, i16, i64, i32 } }
+; CHECK: %CBuffer.MyConstants = type { { double, <3 x float>, float, <3 x double>, half, <2 x double>, float, <3 x half>, <3 x half> } }
+
+; CHECK: @CB1 = external constant %CBuffer.CB1
+; CHECK: @CB2 = external constant %CBuffer.CB2
+; CHECK: @MyConstants = external constant %CBuffer.MyConstants
+
+; CHECK: !dx.resources = !{[[ResList:[!][0-9]+]]}
+
+; CHECK: [[ResList]] = !{null, null, [[CBList:[!][0-9]+]], null}
+; CHECK: [[CBList]] = !{![[CB1:[0-9]+]], ![[CB2:[0-9]+]], ![[MYCONSTANTS:[0-9]+]]}
+; CHECK: ![[CB1]] = !{i32 0, ptr @CB1, !"CB1", i32 0, i32 0, i32 1, i32 24, null}
+; CHECK: ![[CB2]] = !{i32 1, ptr @CB2, !"CB2", i32 0, i32 1, i32 1, i32 36, null}
+; CHECK: ![[MYCONSTANTS]] = !{i32 2, ptr @MyConstants, !"MyConstants", i32 15, i32 5, i32 1, i32 96, null}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
index e2a1c09..0b454c1 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/cbuffer-only.ll
@@ -7,7 +7,7 @@
target triple = "dxil-pc-shadermodel6.6-compute"
define void @cbuffer_is_only_binding() {
- %cbuf = call target("dx.CBuffer", target("dx.Layout", {float}, 4, 0))
+ %cbuf = call target("dx.CBuffer", <{ float }>)
@llvm.dx.resource.handlefrombinding(i32 1, i32 8, i32 1, i32 0, ptr null)
; CHECK: %CBuffer = type { float }
diff --git a/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll
new file mode 100644
index 0000000..ff03bf1
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/bufferGetDimensions.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+define i32 @test_getdimensions_no_mips() {
+ ; CHECK: %[[HANDLE:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217,
+ ; CHECK-NEXT: %[[ANNOT_HANDLE:.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[HANDLE]]
+ %handle = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+
+ ; CHECK-NEXT: %[[RETVAL:.*]] = call %dx.types.Dimensions @dx.op.getDimensions(i32 72, %dx.types.Handle %[[ANNOT_HANDLE]], i32 undef)
+ ; CHECK-NEXT: %[[DIM:.*]] = extractvalue %dx.types.Dimensions %[[RETVAL]], 0
+ %1 = call i32 @llvm.dx.resource.getdimensions.x(target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %handle)
+
+ ; CHECK-NEXT: ret i32 %[[DIM]]
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
new file mode 100644
index 0000000..bf14dcf
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
@@ -0,0 +1,88 @@
+# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# This loop has six stores, which exceeds the limit set by
+# `pipeliner-max-num-stores`.
+
+# CHECK: Too many stores
+
+--- |
+ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+ target triple = "hexagon-unknown-linux-musl"
+
+ define void @f(ptr %a, i32 %n) #0 {
+ entry:
+ %guard = icmp sgt i32 %n, 0
+ %btc = sub nsw i32 %n, 1
+ br i1 %guard, label %loop.preheader, label %exit
+
+ loop.preheader: ; preds = %entry
+ %0 = add i32 %n, 1
+ %cgep = getelementptr i8, ptr %a, i32 %0
+ br label %loop
+
+ loop: ; preds = %loop.preheader, %loop
+ %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ]
+ %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ]
+ %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2
+ store i8 0, ptr %cgep7, align 1
+ %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1
+ store i8 1, ptr %cgep8, align 1
+ store i8 2, ptr %lsr.iv, align 1
+ %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1
+ store i8 3, ptr %cgep9, align 1
+ %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2
+ store i8 4, ptr %cgep10, align 1
+ %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3
+ store i8 5, ptr %cgep11, align 1
+ %i.dec = sub i32 %i, 1
+ %ec = icmp eq i32 %i.dec, 0
+ br i1 %ec, label %exit, label %loop
+
+ exit: ; preds = %loop, %entry
+ ret void
+ }
+
+ attributes #0 = { "target-cpu"="hexagonv79" }
+...
+---
+name: f
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.3(0x30000000)
+ liveins: $r0, $r1
+
+ %7:intregs = COPY $r1
+ %6:intregs = COPY $r0
+ %8:predregs = C2_cmpgti %7, 0
+ J2_jumpf %8, %bb.3, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1.loop.preheader:
+ successors: %bb.2(0x80000000)
+
+ %0:intregs = A2_addi %7, -1
+ %1:intregs = S4_addaddi %7, %6, 1
+ %10:intregs = A2_tfrsi 0
+ %11:intregs = A2_tfrsi 1
+ %14:intregs = COPY %0
+ J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2.loop (machine-block-address-taken):
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %2:intregs = PHI %1, %bb.1, %4, %bb.2
+ S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7)
+ %4:intregs = A2_addi %2, -1
+ S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8)
+ S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv)
+ S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9)
+ S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10)
+ S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11)
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3.exit:
+ PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
index d111cf2..50f7d40 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
@@ -284,7 +284,6 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64S-NEXT: addi.d $sp, $sp, -16
; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64S-NEXT: lu12i.w $a0, -12
-; LA64S-NEXT: lu32i.d $a0, 0
; LA64S-NEXT: movgr2fr.w $fa0, $a0
; LA64S-NEXT: ori $a0, $zero, 1
; LA64S-NEXT: ori $a1, $zero, 2
@@ -326,7 +325,6 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64F-LP64D-NEXT: lu12i.w $a0, -12
-; LA64F-LP64D-NEXT: lu32i.d $a0, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64F-LP64D-NEXT: ori $a0, $zero, 1
; LA64F-LP64D-NEXT: ori $a1, $zero, 2
@@ -368,7 +366,6 @@ define i32 @caller_half_in_fregs() nounwind {
; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64D-LP64D-NEXT: lu12i.w $a0, -12
-; LA64D-LP64D-NEXT: lu32i.d $a0, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64D-LP64D-NEXT: ori $a0, $zero, 1
; LA64D-LP64D-NEXT: ori $a1, $zero, 2
@@ -688,32 +685,23 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64S-NEXT: addi.d $sp, $sp, -16
; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64S-NEXT: lu12i.w $a1, -12
+; LA64S-NEXT: movgr2fr.w $fa1, $a1
; LA64S-NEXT: ori $a0, $a1, 2176
+; LA64S-NEXT: lu12i.w $a2, -13
+; LA64S-NEXT: ori $a2, $a2, 3072
+; LA64S-NEXT: movgr2fr.w $fa0, $a2
; LA64S-NEXT: ori $a2, $a1, 512
-; LA64S-NEXT: ori $a3, $a1, 1024
-; LA64S-NEXT: ori $a4, $a1, 1280
-; LA64S-NEXT: ori $a5, $a1, 1536
-; LA64S-NEXT: ori $a6, $a1, 1792
-; LA64S-NEXT: ori $a7, $a1, 2048
-; LA64S-NEXT: lu32i.d $a1, 0
-; LA64S-NEXT: movgr2fr.w $fa1, $a1
-; LA64S-NEXT: lu12i.w $a1, -13
-; LA64S-NEXT: ori $a1, $a1, 3072
-; LA64S-NEXT: lu32i.d $a1, 0
-; LA64S-NEXT: movgr2fr.w $fa0, $a1
-; LA64S-NEXT: lu32i.d $a2, 0
; LA64S-NEXT: movgr2fr.w $fa2, $a2
-; LA64S-NEXT: lu32i.d $a3, 0
-; LA64S-NEXT: movgr2fr.w $fa3, $a3
-; LA64S-NEXT: lu32i.d $a4, 0
-; LA64S-NEXT: movgr2fr.w $fa4, $a4
-; LA64S-NEXT: lu32i.d $a5, 0
-; LA64S-NEXT: movgr2fr.w $fa5, $a5
-; LA64S-NEXT: lu32i.d $a0, 0
-; LA64S-NEXT: lu32i.d $a6, 0
-; LA64S-NEXT: movgr2fr.w $fa6, $a6
-; LA64S-NEXT: lu32i.d $a7, 0
-; LA64S-NEXT: movgr2fr.w $fa7, $a7
+; LA64S-NEXT: ori $a2, $a1, 1024
+; LA64S-NEXT: movgr2fr.w $fa3, $a2
+; LA64S-NEXT: ori $a2, $a1, 1280
+; LA64S-NEXT: movgr2fr.w $fa4, $a2
+; LA64S-NEXT: ori $a2, $a1, 1536
+; LA64S-NEXT: movgr2fr.w $fa5, $a2
+; LA64S-NEXT: ori $a2, $a1, 1792
+; LA64S-NEXT: movgr2fr.w $fa6, $a2
+; LA64S-NEXT: ori $a1, $a1, 2048
+; LA64S-NEXT: movgr2fr.w $fa7, $a1
; LA64S-NEXT: ori $a1, $zero, 10
; LA64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64S-NEXT: jirl $ra, $ra, 0
@@ -730,22 +718,14 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64F-LP64S-NEXT: lu12i.w $a1, -12
; LA64F-LP64S-NEXT: ori $t0, $a1, 2176
; LA64F-LP64S-NEXT: lu32i.d $t0, 0
+; LA64F-LP64S-NEXT: lu12i.w $a0, -13
+; LA64F-LP64S-NEXT: ori $a0, $a0, 3072
; LA64F-LP64S-NEXT: ori $a2, $a1, 512
; LA64F-LP64S-NEXT: ori $a3, $a1, 1024
; LA64F-LP64S-NEXT: ori $a4, $a1, 1280
; LA64F-LP64S-NEXT: ori $a5, $a1, 1536
; LA64F-LP64S-NEXT: ori $a6, $a1, 1792
; LA64F-LP64S-NEXT: ori $a7, $a1, 2048
-; LA64F-LP64S-NEXT: lu32i.d $a1, 0
-; LA64F-LP64S-NEXT: lu12i.w $a0, -13
-; LA64F-LP64S-NEXT: ori $a0, $a0, 3072
-; LA64F-LP64S-NEXT: lu32i.d $a0, 0
-; LA64F-LP64S-NEXT: lu32i.d $a2, 0
-; LA64F-LP64S-NEXT: lu32i.d $a3, 0
-; LA64F-LP64S-NEXT: lu32i.d $a4, 0
-; LA64F-LP64S-NEXT: lu32i.d $a5, 0
-; LA64F-LP64S-NEXT: lu32i.d $a6, 0
-; LA64F-LP64S-NEXT: lu32i.d $a7, 0
; LA64F-LP64S-NEXT: st.w $t0, $sp, 0
; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
@@ -758,32 +738,23 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64F-LP64D-NEXT: lu12i.w $a1, -12
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1
; LA64F-LP64D-NEXT: ori $a0, $a1, 2176
+; LA64F-LP64D-NEXT: lu12i.w $a2, -13
+; LA64F-LP64D-NEXT: ori $a2, $a2, 3072
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a2
; LA64F-LP64D-NEXT: ori $a2, $a1, 512
-; LA64F-LP64D-NEXT: ori $a3, $a1, 1024
-; LA64F-LP64D-NEXT: ori $a4, $a1, 1280
-; LA64F-LP64D-NEXT: ori $a5, $a1, 1536
-; LA64F-LP64D-NEXT: ori $a6, $a1, 1792
-; LA64F-LP64D-NEXT: ori $a7, $a1, 2048
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1
-; LA64F-LP64D-NEXT: lu12i.w $a1, -13
-; LA64F-LP64D-NEXT: ori $a1, $a1, 3072
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
-; LA64F-LP64D-NEXT: lu32i.d $a2, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa2, $a2
-; LA64F-LP64D-NEXT: lu32i.d $a3, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a3
-; LA64F-LP64D-NEXT: lu32i.d $a4, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a4
-; LA64F-LP64D-NEXT: lu32i.d $a5, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a5
-; LA64F-LP64D-NEXT: lu32i.d $a0, 0
-; LA64F-LP64D-NEXT: lu32i.d $a6, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a6
-; LA64F-LP64D-NEXT: lu32i.d $a7, 0
-; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a7
+; LA64F-LP64D-NEXT: ori $a2, $a1, 1024
+; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a2
+; LA64F-LP64D-NEXT: ori $a2, $a1, 1280
+; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a2
+; LA64F-LP64D-NEXT: ori $a2, $a1, 1536
+; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a2
+; LA64F-LP64D-NEXT: ori $a2, $a1, 1792
+; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a2
+; LA64F-LP64D-NEXT: ori $a1, $a1, 2048
+; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a1
; LA64F-LP64D-NEXT: ori $a1, $zero, 10
; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
@@ -800,22 +771,14 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64D-LP64S-NEXT: lu12i.w $a1, -12
; LA64D-LP64S-NEXT: ori $t0, $a1, 2176
; LA64D-LP64S-NEXT: lu32i.d $t0, 0
+; LA64D-LP64S-NEXT: lu12i.w $a0, -13
+; LA64D-LP64S-NEXT: ori $a0, $a0, 3072
; LA64D-LP64S-NEXT: ori $a2, $a1, 512
; LA64D-LP64S-NEXT: ori $a3, $a1, 1024
; LA64D-LP64S-NEXT: ori $a4, $a1, 1280
; LA64D-LP64S-NEXT: ori $a5, $a1, 1536
; LA64D-LP64S-NEXT: ori $a6, $a1, 1792
; LA64D-LP64S-NEXT: ori $a7, $a1, 2048
-; LA64D-LP64S-NEXT: lu32i.d $a1, 0
-; LA64D-LP64S-NEXT: lu12i.w $a0, -13
-; LA64D-LP64S-NEXT: ori $a0, $a0, 3072
-; LA64D-LP64S-NEXT: lu32i.d $a0, 0
-; LA64D-LP64S-NEXT: lu32i.d $a2, 0
-; LA64D-LP64S-NEXT: lu32i.d $a3, 0
-; LA64D-LP64S-NEXT: lu32i.d $a4, 0
-; LA64D-LP64S-NEXT: lu32i.d $a5, 0
-; LA64D-LP64S-NEXT: lu32i.d $a6, 0
-; LA64D-LP64S-NEXT: lu32i.d $a7, 0
; LA64D-LP64S-NEXT: st.w $t0, $sp, 0
; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
@@ -828,32 +791,23 @@ define i32 @caller_half_in_gregs() nounwind {
; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64D-LP64D-NEXT: lu12i.w $a1, -12
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1
; LA64D-LP64D-NEXT: ori $a0, $a1, 2176
+; LA64D-LP64D-NEXT: lu12i.w $a2, -13
+; LA64D-LP64D-NEXT: ori $a2, $a2, 3072
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a2
; LA64D-LP64D-NEXT: ori $a2, $a1, 512
-; LA64D-LP64D-NEXT: ori $a3, $a1, 1024
-; LA64D-LP64D-NEXT: ori $a4, $a1, 1280
-; LA64D-LP64D-NEXT: ori $a5, $a1, 1536
-; LA64D-LP64D-NEXT: ori $a6, $a1, 1792
-; LA64D-LP64D-NEXT: ori $a7, $a1, 2048
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1
-; LA64D-LP64D-NEXT: lu12i.w $a1, -13
-; LA64D-LP64D-NEXT: ori $a1, $a1, 3072
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
-; LA64D-LP64D-NEXT: lu32i.d $a2, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa2, $a2
-; LA64D-LP64D-NEXT: lu32i.d $a3, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a3
-; LA64D-LP64D-NEXT: lu32i.d $a4, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a4
-; LA64D-LP64D-NEXT: lu32i.d $a5, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a5
-; LA64D-LP64D-NEXT: lu32i.d $a0, 0
-; LA64D-LP64D-NEXT: lu32i.d $a6, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a6
-; LA64D-LP64D-NEXT: lu32i.d $a7, 0
-; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a7
+; LA64D-LP64D-NEXT: ori $a2, $a1, 1024
+; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a2
+; LA64D-LP64D-NEXT: ori $a2, $a1, 1280
+; LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a2
+; LA64D-LP64D-NEXT: ori $a2, $a1, 1536
+; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a2
+; LA64D-LP64D-NEXT: ori $a2, $a1, 1792
+; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a2
+; LA64D-LP64D-NEXT: ori $a1, $a1, 2048
+; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a1
; LA64D-LP64D-NEXT: ori $a1, $zero, 10
; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs)
; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
@@ -1231,28 +1185,20 @@ define i32 @caller_half_on_stack() nounwind {
; LA64S-NEXT: ori $t0, $a0, 3200
; LA64S-NEXT: lu32i.d $t0, 0
; LA64S-NEXT: ori $a1, $a0, 2304
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa0, $a1
; LA64S-NEXT: ori $a1, $a0, 2432
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa1, $a1
; LA64S-NEXT: ori $a1, $a0, 2560
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa2, $a1
; LA64S-NEXT: ori $a1, $a0, 2688
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa3, $a1
; LA64S-NEXT: ori $a1, $a0, 2816
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa4, $a1
; LA64S-NEXT: ori $a1, $a0, 2944
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa5, $a1
; LA64S-NEXT: ori $a1, $a0, 3072
-; LA64S-NEXT: lu32i.d $a1, 0
; LA64S-NEXT: movgr2fr.w $fa6, $a1
; LA64S-NEXT: ori $a0, $a0, 3136
-; LA64S-NEXT: lu32i.d $a0, 0
; LA64S-NEXT: movgr2fr.w $fa7, $a0
; LA64S-NEXT: ori $a0, $zero, 1
; LA64S-NEXT: ori $a1, $zero, 2
@@ -1323,28 +1269,20 @@ define i32 @caller_half_on_stack() nounwind {
; LA64F-LP64D-NEXT: ori $t0, $a0, 3200
; LA64F-LP64D-NEXT: lu32i.d $t0, 0
; LA64F-LP64D-NEXT: ori $a1, $a0, 2304
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 2432
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 2560
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa2, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 2688
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 2816
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 2944
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a1
; LA64F-LP64D-NEXT: ori $a1, $a0, 3072
-; LA64F-LP64D-NEXT: lu32i.d $a1, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a1
; LA64F-LP64D-NEXT: ori $a0, $a0, 3136
-; LA64F-LP64D-NEXT: lu32i.d $a0, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a0
; LA64F-LP64D-NEXT: ori $a0, $zero, 1
; LA64F-LP64D-NEXT: ori $a1, $zero, 2
@@ -1415,28 +1353,20 @@ define i32 @caller_half_on_stack() nounwind {
; LA64D-LP64D-NEXT: ori $t0, $a0, 3200
; LA64D-LP64D-NEXT: lu32i.d $t0, 0
; LA64D-LP64D-NEXT: ori $a1, $a0, 2304
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 2432
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 2560
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa2, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 2688
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 2816
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 2944
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a1
; LA64D-LP64D-NEXT: ori $a1, $a0, 3072
-; LA64D-LP64D-NEXT: lu32i.d $a1, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a1
; LA64D-LP64D-NEXT: ori $a0, $a0, 3136
-; LA64D-LP64D-NEXT: lu32i.d $a0, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a0
; LA64D-LP64D-NEXT: ori $a0, $zero, 1
; LA64D-LP64D-NEXT: ori $a1, $zero, 2
@@ -1493,7 +1423,6 @@ define half @callee_half_ret() nounwind {
; LA64S: # %bb.0:
; LA64S-NEXT: lu12i.w $a0, -13
; LA64S-NEXT: ori $a0, $a0, 3072
-; LA64S-NEXT: lu32i.d $a0, 0
; LA64S-NEXT: movgr2fr.w $fa0, $a0
; LA64S-NEXT: ret
;
@@ -1501,14 +1430,12 @@ define half @callee_half_ret() nounwind {
; LA64F-LP64S: # %bb.0:
; LA64F-LP64S-NEXT: lu12i.w $a0, -13
; LA64F-LP64S-NEXT: ori $a0, $a0, 3072
-; LA64F-LP64S-NEXT: lu32i.d $a0, 0
; LA64F-LP64S-NEXT: ret
;
; LA64F-LP64D-LABEL: callee_half_ret:
; LA64F-LP64D: # %bb.0:
; LA64F-LP64D-NEXT: lu12i.w $a0, -13
; LA64F-LP64D-NEXT: ori $a0, $a0, 3072
-; LA64F-LP64D-NEXT: lu32i.d $a0, 0
; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64F-LP64D-NEXT: ret
;
@@ -1516,14 +1443,12 @@ define half @callee_half_ret() nounwind {
; LA64D-LP64S: # %bb.0:
; LA64D-LP64S-NEXT: lu12i.w $a0, -13
; LA64D-LP64S-NEXT: ori $a0, $a0, 3072
-; LA64D-LP64S-NEXT: lu32i.d $a0, 0
; LA64D-LP64S-NEXT: ret
;
; LA64D-LP64D-LABEL: callee_half_ret:
; LA64D-LP64D: # %bb.0:
; LA64D-LP64D-NEXT: lu12i.w $a0, -13
; LA64D-LP64D-NEXT: ori $a0, $a0, 3072
-; LA64D-LP64D-NEXT: lu32i.d $a0, 0
; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
; LA64D-LP64D-NEXT: ret
ret half 1.0
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index a6e3f79..0d0fb21 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -76,7 +76,6 @@ define float @float_fsub_acquire(ptr %p) nounwind {
; LA64F: # %bb.0:
; LA64F-NEXT: fld.s $fa0, $a0, 0
; LA64F-NEXT: lu12i.w $a1, -264192
-; LA64F-NEXT: lu32i.d $a1, 0
; LA64F-NEXT: movgr2fr.w $fa1, $a1
; LA64F-NEXT: .p2align 4, , 16
; LA64F-NEXT: .LBB1_1: # %atomicrmw.start
@@ -641,7 +640,6 @@ define float @float_fsub_release(ptr %p) nounwind {
; LA64F: # %bb.0:
; LA64F-NEXT: fld.s $fa0, $a0, 0
; LA64F-NEXT: lu12i.w $a1, -264192
-; LA64F-NEXT: lu32i.d $a1, 0
; LA64F-NEXT: movgr2fr.w $fa1, $a1
; LA64F-NEXT: .p2align 4, , 16
; LA64F-NEXT: .LBB9_1: # %atomicrmw.start
@@ -1206,7 +1204,6 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
; LA64F: # %bb.0:
; LA64F-NEXT: fld.s $fa0, $a0, 0
; LA64F-NEXT: lu12i.w $a1, -264192
-; LA64F-NEXT: lu32i.d $a1, 0
; LA64F-NEXT: movgr2fr.w $fa1, $a1
; LA64F-NEXT: .p2align 4, , 16
; LA64F-NEXT: .LBB17_1: # %atomicrmw.start
@@ -1771,7 +1768,6 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
; LA64F: # %bb.0:
; LA64F-NEXT: fld.s $fa0, $a0, 0
; LA64F-NEXT: lu12i.w $a1, -264192
-; LA64F-NEXT: lu32i.d $a1, 0
; LA64F-NEXT: movgr2fr.w $fa1, $a1
; LA64F-NEXT: .p2align 4, , 16
; LA64F-NEXT: .LBB25_1: # %atomicrmw.start
@@ -2336,7 +2332,6 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
; LA64F: # %bb.0:
; LA64F-NEXT: fld.s $fa0, $a0, 0
; LA64F-NEXT: lu12i.w $a1, -264192
-; LA64F-NEXT: lu32i.d $a1, 0
; LA64F-NEXT: movgr2fr.w $fa1, $a1
; LA64F-NEXT: .p2align 4, , 16
; LA64F-NEXT: .LBB33_1: # %atomicrmw.start
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
index bf31ccb..559cc53 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
@@ -32,6 +32,40 @@ define void @select_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v32i8_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v32i8_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <32 x i8>, ptr %a0
+ %v1 = load <32 x i8>, ptr %a1
+ %sel = select <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <32 x i8> %v0, <32 x i8> %v1
+ store <32 x i8> %sel, ptr %res
+ ret void
+}
+
+define void @select_v32i8_2(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v32i8_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <32 x i8>, ptr %a0
+ %v1 = load <32 x i8>, ptr %a1
+ %sel = select <32 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <32 x i8> %v0, <32 x i8> %v1
+ store <32 x i8> %sel, ptr %res
+ ret void
+}
+
define void @select_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v16i16:
; CHECK: # %bb.0:
@@ -49,6 +83,40 @@ define void @select_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v16i16_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v16i16_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI5_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI5_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <16 x i16>, ptr %a0
+ %v1 = load <16 x i16>, ptr %a1
+ %sel = select <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> %v0, <16 x i16> %v1
+ store <16 x i16> %sel, ptr %res
+ ret void
+}
+
+define void @select_v16i16_2(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v16i16_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI6_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <16 x i16>, ptr %a0
+ %v1 = load <16 x i16>, ptr %a1
+ %sel = select <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i16> %v0, <16 x i16> %v1
+ store <16 x i16> %sel, ptr %res
+ ret void
+}
+
define void @select_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v8i32:
; CHECK: # %bb.0:
@@ -65,19 +133,70 @@ define void @select_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v8i32_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v8i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI8_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI8_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <8 x i32>, ptr %a0
+ %v1 = load <8 x i32>, ptr %a1
+ %sel = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i32> %v0, <8 x i32> %v1
+ store <8 x i32> %sel, ptr %res
+ ret void
+}
+
+define void @select_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI9_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <8 x float>, ptr %a0
+ %v1 = load <8 x float>, ptr %a1
+ %sel = select <8 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false>, <8 x float> %v0, <8 x float> %v1
+ store <8 x float> %sel, ptr %res
+ ret void
+}
+
define void @select_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI10_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI10_0)
; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <4 x i64>, ptr %a0
%v1 = load <4 x i64>, ptr %a1
- %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i64> %v0, <4 x i64> %v1
+ %sel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v0, <4 x i64> %v1
store <4 x i64> %sel, ptr %res
ret void
}
+
+define void @select_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
+; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <4 x double>, ptr %a0
+ %v1 = load <4 x double>, ptr %a1
+ %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v0, <4 x double> %v1
+ store <4 x double> %sel, ptr %res
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
index 8f25a6b..25c4f09 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
@@ -16,6 +16,20 @@ define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind {
ret void
}
+define void @select_v16i8_imm_1(ptr %res, ptr %a0) nounwind {
+; CHECK-LABEL: select_v16i8_imm_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vrepli.h $vr1, -256
+; CHECK-NEXT: vbitseli.b $vr1, $vr0, 1
+; CHECK-NEXT: vst $vr1, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <16 x i8>, ptr %a0
+ %sel = select <16 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8> %v0
+ store <16 x i8> %sel, ptr %res
+ ret void
+}
+
define void @select_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v16i8:
; CHECK: # %bb.0:
@@ -32,6 +46,40 @@ define void @select_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v16i8_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v16i8_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <16 x i8>, ptr %a0
+ %v1 = load <16 x i8>, ptr %a1
+ %sel = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> %v0, <16 x i8> %v1
+ store <16 x i8> %sel, ptr %res
+ ret void
+}
+
+define void @select_v16i8_2(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v16i8_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <16 x i8>, ptr %a0
+ %v1 = load <16 x i8>, ptr %a1
+ %sel = select <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <16 x i8> %v0, <16 x i8> %v1
+ store <16 x i8> %sel, ptr %res
+ ret void
+}
+
define void @select_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v8i16:
; CHECK: # %bb.0:
@@ -49,6 +97,40 @@ define void @select_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v8i16_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v8i16_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI6_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <8 x i16>, ptr %a0
+ %v1 = load <8 x i16>, ptr %a1
+ %sel = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %v0, <8 x i16> %v1
+ store <8 x i16> %sel, ptr %res
+ ret void
+}
+
+define void @select_v8i16_2(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v8i16_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI7_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI7_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <8 x i16>, ptr %a0
+ %v1 = load <8 x i16>, ptr %a1
+ %sel = select <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %v0, <8 x i16> %v1
+ store <8 x i16> %sel, ptr %res
+ ret void
+}
+
define void @select_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v4i32:
; CHECK: # %bb.0:
@@ -65,13 +147,47 @@ define void @select_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind {
ret void
}
+define void @select_v4i32_1(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v4i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI9_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <4 x i32>, ptr %a0
+ %v1 = load <4 x i32>, ptr %a1
+ %sel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> %v0, <4 x i32> %v1
+ store <4 x i32> %sel, ptr %res
+ ret void
+}
+
+define void @select_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI10_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI10_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <4 x float>, ptr %a0
+ %v1 = load <4 x float>, ptr %a1
+ %sel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %v0, <4 x float> %v1
+ store <4 x float> %sel, ptr %res
+ ret void
+}
+
define void @select_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind {
; CHECK-LABEL: select_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vld $vr1, $a2, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI11_0)
; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
@@ -81,3 +197,20 @@ define void @select_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind {
store <2 x i64> %sel, ptr %res
ret void
}
+
+define void @select_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind {
+; CHECK-LABEL: select_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI12_0)
+; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI12_0)
+; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+ %v0 = load <2 x double>, ptr %a0
+ %v1 = load <2 x double>, ptr %a1
+ %sel = select <2 x i1> <i1 false, i1 true>, <2 x double> %v0, <2 x double> %v1
+ store <2 x double> %sel, ptr %res
+ ret void
+}
diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
index 1030917..302f70f 100644
--- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck --strict-whitespace %s
--- |
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
diff --git a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
index 153ca10..72f10ae 100644
--- a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
@@ -1141,29 +1141,88 @@ define <2 x i32> @test_select_cc(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
ret <2 x i32> %r
}
-define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
-; CHECK-NOI32X2-LABEL: test_trunc_2xi32(
+define <2 x i16> @test_trunc_2xi32_to_2xi16(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-NOI32X2: {
; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
; CHECK-NOI32X2-EMPTY:
; CHECK-NOI32X2-NEXT: // %bb.0:
-; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi16_param_0];
; CHECK-NOI32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NOI32X2-NEXT: ret;
;
-; CHECK-I32X2-LABEL: test_trunc_2xi32(
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi16(
; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
; CHECK-I32X2-EMPTY:
; CHECK-I32X2-NEXT: // %bb.0:
-; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_param_0];
-; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi16_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-I32X2-NEXT: ret;
%r = trunc <2 x i32> %a to <2 x i16>
ret <2 x i16> %r
}
+define <2 x i8> @test_trunc_2xi32_to_2xi8(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NOI32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi8(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi8_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-I32X2-NEXT: cvt.u16.u32 %rs2, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b8 [func_retval0], {%rs2, %rs1};
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i8>
+ ret <2 x i8> %r
+}
+
+define <2 x i1> @test_trunc_2xi32_to_2xi1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-NOI32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32_to_2xi1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_to_2xi1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: st.param.b8 [func_retval0+1], %r2;
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i1>
+ ret <2 x i1> %r
+}
+
define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
; CHECK-LABEL: test_trunc_2xi64(
; CHECK: {
@@ -1180,14 +1239,14 @@ define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
ret <2 x i32> %r
}
-define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
-; CHECK-LABEL: test_zext_2xi32(
+define <2 x i32> @test_zext_2xi16_to_2xi32(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_zext_2xi16_to_2xi32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi16_to_2xi32_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
@@ -1197,6 +1256,47 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
ret <2 x i32> %r
}
+define <2 x i32> @test_zext_2xi8_to_2xi32(<2 x i8> %a) #0 {
+; CHECK-LABEL: test_zext_2xi8_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_zext_2xi8_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_zext_2xi1_to_2xi32(<2 x i1> %a) #0 {
+; CHECK-LABEL: test_zext_2xi1_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [test_zext_2xi1_to_2xi32_param_0+1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs2, 0;
+; CHECK-NEXT: ld.param.b8 %rs3, [test_zext_2xi1_to_2xi32_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs4, 0;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs3;
+; CHECK-NEXT: and.b32 %r4, %r3, 1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i1> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x i64> @test_zext_2xi64(<2 x i32> %a) #0 {
; CHECK-NOI32X2-LABEL: test_zext_2xi64(
; CHECK-NOI32X2: {
@@ -1566,6 +1666,55 @@ entry:
ret void
}
+define <2 x i32> @test_sext_v2i8_to_v2i32 (<2 x i8> %a) {
+; CHECK-LABEL: test_sext_v2i8_to_v2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_sext_v2i8_to_v2i32_param_0];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r3};
+; CHECK-NEXT: ret;
+ %r = sext <2 x i8> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_sext_v2i16_to_v2i32 (<2 x i16> %a) {
+; CHECK-NOI32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-NOI32X2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; CHECK-NOI32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sext_v2i16_to_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b16 %rs<2>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b32 %r1, [test_sext_v2i16_to_v2i32_param_0];
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r2, %r1;
+; CHECK-I32X2-NEXT: mov.b32 {_, %rs1}, %r1;
+; CHECK-I32X2-NEXT: cvt.s32.s16 %r3, %rs1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = sext <2 x i16> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
define <2 x float> @test_uitofp_v2i32(<2 x i32> %a) {
; CHECK-NOI32X2-LABEL: test_uitofp_v2i32(
; CHECK-NOI32X2: {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 1edb387..f345e08 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 2e80c4c..29b130f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 817b1d5..4e463a14 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1(
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
index cbf647f..fc8cce4 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index a37b1a9..22eb729 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_ld_16x64b
define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index bf2adac..33483b5 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %}
declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index 0636a06..ccf6541 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_st_16x64b
define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) {
diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
new file mode 100644
index 0000000..e67d031
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation.
+
+; Function for the vector type v2i64 `a + {1, 1}`
+define <2 x i64> @test_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_v2i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltisw v3, 1
+; CHECK-NEXT: vupklsw v3, v3
+; CHECK-NEXT: vaddudm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <2 x i64> %a, splat (i64 1)
+ ret <2 x i64> %add
+}
+
+; Function for the vector type v4i32 `a + {1, 1, 1, 1}`
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_v4i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltisw v3, 1
+; CHECK-NEXT: vadduwm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <4 x i32> %a, splat (i32 1)
+ ret <4 x i32> %add
+}
+
+; Function for the vector type v8i16 `a + {1, 1, 1, 1, 1, 1, 1, 1}`
+define <8 x i16> @test_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_v8i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vspltish v3, 1
+; CHECK-NEXT: vadduhm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <8 x i16> %a, splat (i16 1)
+ ret <8 x i16> %add
+}
+
+; Function for the vector type v16i8 `a + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}`
+define <16 x i8> @test_16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_16i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xxspltib v3, 1
+; CHECK-NEXT: vaddubm v2, v2, v3
+; CHECK-NEXT: blr
+entry:
+ %add = add <16 x i8> %a, splat (i8 1)
+ ret <16 x i8> %add
+}
diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll
index 0f7e0c7..1325abf 100644
--- a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll
@@ -95,3 +95,80 @@ declare i4 @llvm.ctpop.i4(i4) #1
!6 = !{!"short", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
+
+; Function to lockdown changes for floating point vector comparisons
+define range(i32 0, 5) i32 @cols_needed(ptr %colauths){
+; POWERPC_64LE-LABEL: cols_needed:
+; POWERPC_64LE: # %bb.0: # %entry
+; POWERPC_64LE-NEXT: lxv vs0, 0(r3)
+; POWERPC_64LE-NEXT: xxlxor vs1, vs1, vs1
+; POWERPC_64LE-NEXT: li r4, 4
+; POWERPC_64LE-NEXT: li r3, 0
+; POWERPC_64LE-NEXT: xvcmpeqsp vs0, vs0, vs1
+; POWERPC_64LE-NEXT: xxlnor v2, vs0, vs0
+; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2
+; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
+; POWERPC_64LE-NEXT: rlwinm r4, r4, 1, 30, 30
+; POWERPC_64LE-NEXT: sub r3, r4, r3
+; POWERPC_64LE-NEXT: mfvsrwz r4, v2
+; POWERPC_64LE-NEXT: rlwinm r4, r4, 2, 29, 29
+; POWERPC_64LE-NEXT: or r3, r3, r4
+; POWERPC_64LE-NEXT: li r4, 12
+; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2
+; POWERPC_64LE-NEXT: slwi r4, r4, 3
+; POWERPC_64LE-NEXT: or r3, r3, r4
+; POWERPC_64LE-NEXT: clrlwi r3, r3, 28
+; POWERPC_64LE-NEXT: stb r3, -1(r1)
+; POWERPC_64LE-NEXT: lbz r3, -1(r1)
+; POWERPC_64LE-NEXT: popcntd r3, r3
+; POWERPC_64LE-NEXT: blr
+;
+; POWERPC_64-LABEL: cols_needed:
+; POWERPC_64: # %bb.0: # %entry
+; POWERPC_64-NEXT: lxv vs0, 0(r3)
+; POWERPC_64-NEXT: xxlxor vs1, vs1, vs1
+; POWERPC_64-NEXT: li r4, 8
+; POWERPC_64-NEXT: xvcmpeqsp vs0, vs0, vs1
+; POWERPC_64-NEXT: xxlnor v2, vs0, vs0
+; POWERPC_64-NEXT: vextuwlx r4, r4, v2
+; POWERPC_64-NEXT: mfvsrwz r3, v2
+; POWERPC_64-NEXT: rlwinm r4, r4, 1, 30, 30
+; POWERPC_64-NEXT: rlwimi r4, r3, 2, 29, 29
+; POWERPC_64-NEXT: li r3, 0
+; POWERPC_64-NEXT: vextuwlx r3, r3, v2
+; POWERPC_64-NEXT: rlwimi r4, r3, 3, 0, 28
+; POWERPC_64-NEXT: li r3, 12
+; POWERPC_64-NEXT: vextuwlx r3, r3, v2
+; POWERPC_64-NEXT: sub r3, r4, r3
+; POWERPC_64-NEXT: clrlwi r3, r3, 28
+; POWERPC_64-NEXT: stb r3, -1(r1)
+; POWERPC_64-NEXT: lbz r3, -1(r1)
+; POWERPC_64-NEXT: popcntd r3, r3
+; POWERPC_64-NEXT: blr
+;
+; POWERPC_32-LABEL: cols_needed:
+; POWERPC_32: # %bb.0: # %entry
+; POWERPC_32-NEXT: lxv vs0, 0(r3)
+; POWERPC_32-NEXT: xxlxor vs1, vs1, vs1
+; POWERPC_32-NEXT: xvcmpeqsp vs0, vs0, vs1
+; POWERPC_32-NEXT: xxlnor vs0, vs0, vs0
+; POWERPC_32-NEXT: stxv vs0, -32(r1)
+; POWERPC_32-NEXT: lwz r3, -24(r1)
+; POWERPC_32-NEXT: lwz r4, -28(r1)
+; POWERPC_32-NEXT: rlwinm r3, r3, 1, 30, 30
+; POWERPC_32-NEXT: rlwimi r3, r4, 2, 29, 29
+; POWERPC_32-NEXT: lwz r4, -32(r1)
+; POWERPC_32-NEXT: rlwimi r3, r4, 3, 0, 28
+; POWERPC_32-NEXT: lwz r4, -20(r1)
+; POWERPC_32-NEXT: sub r3, r3, r4
+; POWERPC_32-NEXT: clrlwi r3, r3, 28
+; POWERPC_32-NEXT: popcntw r3, r3
+; POWERPC_32-NEXT: blr
+entry:
+ %0 = load <4 x float>, ptr %colauths, align 4, !tbaa !5
+ %1 = fcmp une <4 x float> %0, zeroinitializer
+ %2 = bitcast <4 x i1> %1 to i4
+ %3 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2)
+ %4 = zext nneg i4 %3 to i32
+ ret i32 %4
+}
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index e71f59c..cad684e 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -325,24 +325,21 @@ define float @sqrt_afn_ieee(float %x) #0 {
;
; GLOBAL-LABEL: sqrt_afn_ieee:
; GLOBAL: # %bb.0:
-; GLOBAL-NEXT: addis 3, 2, .LCPI11_1@toc@ha
-; GLOBAL-NEXT: xsabsdp 0, 1
-; GLOBAL-NEXT: lfs 2, .LCPI11_1@toc@l(3)
-; GLOBAL-NEXT: fcmpu 0, 0, 2
-; GLOBAL-NEXT: xxlxor 0, 0, 0
-; GLOBAL-NEXT: blt 0, .LBB11_2
-; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: vspltisw 2, -3
; GLOBAL-NEXT: addis 3, 2, .LCPI11_0@toc@ha
-; GLOBAL-NEXT: xvcvsxwdp 2, 34
-; GLOBAL-NEXT: xsmulsp 1, 1, 0
-; GLOBAL-NEXT: xsmaddasp 2, 1, 0
+; GLOBAL-NEXT: xvcvsxwdp 3, 34
+; GLOBAL-NEXT: xsmulsp 2, 1, 0
+; GLOBAL-NEXT: xsabsdp 1, 1
+; GLOBAL-NEXT: xsmaddasp 3, 2, 0
; GLOBAL-NEXT: lfs 0, .LCPI11_0@toc@l(3)
-; GLOBAL-NEXT: xsmulsp 0, 1, 0
-; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB11_2:
-; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: addis 3, 2, .LCPI11_1@toc@ha
+; GLOBAL-NEXT: xsmulsp 0, 2, 0
+; GLOBAL-NEXT: lfs 2, .LCPI11_1@toc@l(3)
+; GLOBAL-NEXT: xssubsp 1, 1, 2
+; GLOBAL-NEXT: xxlxor 2, 2, 2
+; GLOBAL-NEXT: xsmulsp 0, 0, 3
+; GLOBAL-NEXT: fsel 1, 1, 0, 2
; GLOBAL-NEXT: blr
%rt = call afn ninf float @llvm.sqrt.f32(float %x)
ret float %rt
@@ -393,21 +390,19 @@ define float @sqrt_afn_preserve_sign(float %x) #1 {
;
; GLOBAL-LABEL: sqrt_afn_preserve_sign:
; GLOBAL: # %bb.0:
-; GLOBAL-NEXT: xxlxor 0, 0, 0
-; GLOBAL-NEXT: fcmpu 0, 1, 0
-; GLOBAL-NEXT: beq 0, .LBB13_2
-; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: vspltisw 2, -3
; GLOBAL-NEXT: addis 3, 2, .LCPI13_0@toc@ha
-; GLOBAL-NEXT: xvcvsxwdp 2, 34
-; GLOBAL-NEXT: xsmulsp 1, 1, 0
-; GLOBAL-NEXT: xsmaddasp 2, 1, 0
+; GLOBAL-NEXT: xvcvsxwdp 3, 34
+; GLOBAL-NEXT: xsmulsp 2, 1, 0
+; GLOBAL-NEXT: xsmaddasp 3, 2, 0
; GLOBAL-NEXT: lfs 0, .LCPI13_0@toc@l(3)
-; GLOBAL-NEXT: xsmulsp 0, 1, 0
-; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB13_2:
-; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: xsmulsp 0, 2, 0
+; GLOBAL-NEXT: xxlxor 2, 2, 2
+; GLOBAL-NEXT: xsmulsp 0, 0, 3
+; GLOBAL-NEXT: fsel 2, 1, 2, 0
+; GLOBAL-NEXT: xsnegdp 1, 1
+; GLOBAL-NEXT: fsel 1, 1, 2, 0
; GLOBAL-NEXT: blr
%rt = call afn ninf float @llvm.sqrt.f32(float %x)
ret float %rt
@@ -462,24 +457,21 @@ define float @sqrt_fast_ieee(float %x) #0 {
;
; GLOBAL-LABEL: sqrt_fast_ieee:
; GLOBAL: # %bb.0:
-; GLOBAL-NEXT: addis 3, 2, .LCPI15_1@toc@ha
-; GLOBAL-NEXT: xsabsdp 0, 1
-; GLOBAL-NEXT: lfs 2, .LCPI15_1@toc@l(3)
-; GLOBAL-NEXT: fcmpu 0, 0, 2
-; GLOBAL-NEXT: xxlxor 0, 0, 0
-; GLOBAL-NEXT: blt 0, .LBB15_2
-; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: vspltisw 2, -3
; GLOBAL-NEXT: addis 3, 2, .LCPI15_0@toc@ha
-; GLOBAL-NEXT: xvcvsxwdp 2, 34
-; GLOBAL-NEXT: xsmulsp 1, 1, 0
-; GLOBAL-NEXT: xsmaddasp 2, 1, 0
+; GLOBAL-NEXT: xvcvsxwdp 3, 34
+; GLOBAL-NEXT: xsmulsp 2, 1, 0
+; GLOBAL-NEXT: xsabsdp 1, 1
+; GLOBAL-NEXT: xsmaddasp 3, 2, 0
; GLOBAL-NEXT: lfs 0, .LCPI15_0@toc@l(3)
-; GLOBAL-NEXT: xsmulsp 0, 1, 0
-; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB15_2:
-; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: addis 3, 2, .LCPI15_1@toc@ha
+; GLOBAL-NEXT: xsmulsp 0, 2, 0
+; GLOBAL-NEXT: lfs 2, .LCPI15_1@toc@l(3)
+; GLOBAL-NEXT: xssubsp 1, 1, 2
+; GLOBAL-NEXT: xxlxor 2, 2, 2
+; GLOBAL-NEXT: xsmulsp 0, 0, 3
+; GLOBAL-NEXT: fsel 1, 1, 0, 2
; GLOBAL-NEXT: blr
%rt = call contract reassoc afn ninf float @llvm.sqrt.f32(float %x)
ret float %rt
@@ -517,21 +509,19 @@ define float @sqrt_fast_preserve_sign(float %x) #1 {
;
; GLOBAL-LABEL: sqrt_fast_preserve_sign:
; GLOBAL: # %bb.0:
-; GLOBAL-NEXT: xxlxor 0, 0, 0
-; GLOBAL-NEXT: fcmpu 0, 1, 0
-; GLOBAL-NEXT: beq 0, .LBB16_2
-; GLOBAL-NEXT: # %bb.1:
; GLOBAL-NEXT: xsrsqrtesp 0, 1
; GLOBAL-NEXT: vspltisw 2, -3
; GLOBAL-NEXT: addis 3, 2, .LCPI16_0@toc@ha
-; GLOBAL-NEXT: xvcvsxwdp 2, 34
-; GLOBAL-NEXT: xsmulsp 1, 1, 0
-; GLOBAL-NEXT: xsmaddasp 2, 1, 0
+; GLOBAL-NEXT: xvcvsxwdp 3, 34
+; GLOBAL-NEXT: xsmulsp 2, 1, 0
+; GLOBAL-NEXT: xsmaddasp 3, 2, 0
; GLOBAL-NEXT: lfs 0, .LCPI16_0@toc@l(3)
-; GLOBAL-NEXT: xsmulsp 0, 1, 0
-; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB16_2:
-; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: xsmulsp 0, 2, 0
+; GLOBAL-NEXT: xxlxor 2, 2, 2
+; GLOBAL-NEXT: xsmulsp 0, 0, 3
+; GLOBAL-NEXT: fsel 2, 1, 2, 0
+; GLOBAL-NEXT: xsnegdp 1, 1
+; GLOBAL-NEXT: fsel 1, 1, 2, 0
; GLOBAL-NEXT: blr
%rt = call contract reassoc ninf afn float @llvm.sqrt.f32(float %x)
ret float %rt
diff --git a/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll b/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll
new file mode 100644
index 0000000..0ee4524
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/lxvkq-vec-constant.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC64-LE-10
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-unknown-unknown \
+; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC64-BE-10
+
+; Test LXVKQ instruction generation for special vector constants matching 128 bit patterns:
+; 0x8000_0000_0000_0000_0000_0000_0000_0000 (MSB set pattern)
+; 0x0000_0000_0000_0000_0000_0000_0000_0001 (LSB set pattern)
+
+; =============================================================================
+; v2i64 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-9223372036854775808, 0>
+define dso_local noundef <2 x i64> @test_v2i64_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_msb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI0_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_msb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: lxvkq v2, 16
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <2 x i64> <i64 -9223372036854775808, i64 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, -9223372036854775808>
+define dso_local noundef <2 x i64> @test_v2i64_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_msb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: lxvkq v2, 16
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_msb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <2 x i64> <i64 0, i64 -9223372036854775808>
+}
+
+; =============================================================================
+; v4i32 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-2147483648, 0, 0, 0>
+define dso_local noundef <4 x i32> @test_v4i32_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_msb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI2_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_msb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: lxvkq v2, 16
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <4 x i32> <i32 -2147483648, i32 0, i32 0, i32 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, -2147483648>
+define dso_local noundef <4 x i32> @test_v4i32_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_msb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: lxvkq v2, 16
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_msb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <4 x i32> <i32 0, i32 0, i32 0, i32 -2147483648>
+}
+
+; =============================================================================
+; v8i16 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-32768, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <8 x i16> @test_v8i16_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_msb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI4_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_msb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: lxvkq v2, 16
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <8 x i16> <i16 -32768, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, 0, 0, 0, 0, -32768>
+define dso_local noundef <8 x i16> @test_v8i16_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_msb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: lxvkq v2, 16
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_msb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 -32768>
+}
+
+; =============================================================================
+; v16i8 tests - MSB set pattern (0x8000_0000_0000_0000_0000_0000_0000_0000)
+; =============================================================================
+
+; Big-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <-128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <16 x i8> @test_v16i8_msb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_msb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI6_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_msb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: lxvkq v2, 16
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <16 x i8> <i8 -128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; Little-Endian: 0x8000_0000_0000_0000_0000_0000_0000_0000 represents <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128>
+define dso_local noundef <16 x i8> @test_v16i8_msb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_msb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: lxvkq v2, 16
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_msb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI7_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI7_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -128>
+}
+
+; =============================================================================
+; v2i64 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 1>
+define dso_local noundef <2 x i64> @test_v2i64_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_lsb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI8_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_lsb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: xxspltib v2, 255
+; POWERPC64-BE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <2 x i64> <i64 0, i64 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0>
+define dso_local noundef <2 x i64> @test_v2i64_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v2i64_lsb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: xxspltib v2, 255
+; POWERPC64-LE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v2i64_lsb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI9_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI9_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <2 x i64> <i64 1, i64 0>
+}
+
+; =============================================================================
+; v4i32 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 1>
+define dso_local noundef <4 x i32> @test_v4i32_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_lsb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI10_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_lsb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: xxspltib v2, 255
+; POWERPC64-BE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0>
+define dso_local noundef <4 x i32> @test_v4i32_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v4i32_lsb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: xxspltib v2, 255
+; POWERPC64-LE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v4i32_lsb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI11_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI11_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+}
+
+; =============================================================================
+; v8i16 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 0, 0, 0, 0, 1>
+define dso_local noundef <8 x i16> @test_v8i16_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_lsb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI12_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_lsb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: xxspltib v2, 255
+; POWERPC64-BE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <8 x i16> @test_v8i16_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v8i16_lsb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: xxspltib v2, 255
+; POWERPC64-LE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v8i16_lsb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI13_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI13_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <8 x i16> <i16 1, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+; =============================================================================
+; v16i8 tests - LSB set pattern (0x0000_0000_0000_0000_0000_0000_0000_0001)
+; =============================================================================
+
+; Big-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>
+define dso_local noundef <16 x i8> @test_v16i8_lsb_set_bigendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_lsb_set_bigendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: plxv v2, .LCPI14_0@PCREL(0), 1
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_lsb_set_bigendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: xxspltib v2, 255
+; POWERPC64-BE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1>
+}
+
+; Little-Endian: 0x0000_0000_0000_0000_0000_0000_0000_0001 represents <1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
+define dso_local noundef <16 x i8> @test_v16i8_lsb_set_littleendian() local_unnamed_addr {
+; POWERPC64-LE-10-LABEL: test_v16i8_lsb_set_littleendian:
+; POWERPC64-LE-10: # %bb.0: # %entry
+; POWERPC64-LE-10-NEXT: xxspltib v2, 255
+; POWERPC64-LE-10-NEXT: vsrq v2, v2, v2
+; POWERPC64-LE-10-NEXT: blr
+;
+; POWERPC64-BE-10-LABEL: test_v16i8_lsb_set_littleendian:
+; POWERPC64-BE-10: # %bb.0: # %entry
+; POWERPC64-BE-10-NEXT: addis r3, r2, .LCPI15_0@toc@ha
+; POWERPC64-BE-10-NEXT: addi r3, r3, .LCPI15_0@toc@l
+; POWERPC64-BE-10-NEXT: lxv v2, 0(r3)
+; POWERPC64-BE-10-NEXT: blr
+entry:
+ ret <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+} \ No newline at end of file
diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
deleted file mode 100644
index e4c93adc..0000000
--- a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
-; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
-
-; Currently the generated code uses `vspltisw` to generate vector of 1s followed by add operation.
-; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate vector of -1s
-; followed by subtraction operation.
-define dso_local noundef <4 x i32> @test1(<4 x i32> %a) {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vspltisw v3, 1
-; CHECK-NEXT: vadduwm v2, v2, v3
-; CHECK-NEXT: blr
-entry:
- %add = add <4 x i32> %a, splat (i32 1)
- ret <4 x i32> %add
-}
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
index 0892210..d506d20 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll
@@ -1566,12 +1566,16 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
; PWR10BE-LABEL: v16i8tov16i64_sign:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: addis r3, r2, .LCPI23_0@toc@ha
+; PWR10BE-NEXT: xxspltib v1, 255
; PWR10BE-NEXT: addi r3, r3, .LCPI23_0@toc@l
+; PWR10BE-NEXT: vsrq v1, v1, v1
; PWR10BE-NEXT: lxv v3, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_1@toc@ha
; PWR10BE-NEXT: addi r3, r3, .LCPI23_1@toc@l
+; PWR10BE-NEXT: vperm v1, v2, v2, v1
; PWR10BE-NEXT: lxv v4, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_2@toc@ha
+; PWR10BE-NEXT: vextsb2d v1, v1
; PWR10BE-NEXT: vperm v3, v2, v2, v3
; PWR10BE-NEXT: addi r3, r3, .LCPI23_2@toc@l
; PWR10BE-NEXT: vextsb2d v3, v3
@@ -1585,23 +1589,18 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: vperm v5, v2, v2, v5
; PWR10BE-NEXT: addi r3, r3, .LCPI23_4@toc@l
; PWR10BE-NEXT: vextsb2d v5, v5
-; PWR10BE-NEXT: lxv v1, 0(r3)
+; PWR10BE-NEXT: lxv v6, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_5@toc@ha
; PWR10BE-NEXT: vperm v0, v2, v2, v0
; PWR10BE-NEXT: addi r3, r3, .LCPI23_5@toc@l
; PWR10BE-NEXT: vextsb2d v0, v0
-; PWR10BE-NEXT: lxv v6, 0(r3)
+; PWR10BE-NEXT: lxv v7, 0(r3)
; PWR10BE-NEXT: addis r3, r2, .LCPI23_6@toc@ha
-; PWR10BE-NEXT: vperm v1, v2, v2, v1
+; PWR10BE-NEXT: vperm v6, v2, v2, v6
; PWR10BE-NEXT: vaddudm v5, v0, v5
; PWR10BE-NEXT: vaddudm v3, v4, v3
; PWR10BE-NEXT: vaddudm v3, v3, v5
; PWR10BE-NEXT: addi r3, r3, .LCPI23_6@toc@l
-; PWR10BE-NEXT: vextsb2d v1, v1
-; PWR10BE-NEXT: lxv v7, 0(r3)
-; PWR10BE-NEXT: addis r3, r2, .LCPI23_7@toc@ha
-; PWR10BE-NEXT: vperm v6, v2, v2, v6
-; PWR10BE-NEXT: addi r3, r3, .LCPI23_7@toc@l
; PWR10BE-NEXT: vextsb2d v6, v6
; PWR10BE-NEXT: lxv v8, 0(r3)
; PWR10BE-NEXT: vperm v7, v2, v2, v7
@@ -1609,7 +1608,7 @@ define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: vperm v2, v2, v2, v8
; PWR10BE-NEXT: vextsb2d v2, v2
; PWR10BE-NEXT: vaddudm v2, v2, v7
-; PWR10BE-NEXT: vaddudm v4, v6, v1
+; PWR10BE-NEXT: vaddudm v4, v1, v6
; PWR10BE-NEXT: vaddudm v2, v4, v2
; PWR10BE-NEXT: vaddudm v2, v2, v3
; PWR10BE-NEXT: xxswapd v3, v2
diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll
index 24a1724..ba7680b 100644
--- a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-eqv.ll
@@ -15,11 +15,9 @@ define <4 x i32> @ternary_A_or_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_or_BC_eqv_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 151
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -34,12 +32,10 @@ define <2 x i64> @ternary_A_or_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_or_BC_eqv_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 151
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -54,11 +50,9 @@ define <16 x i8> @ternary_A_or_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_or_BC_eqv_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 151
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -73,11 +67,9 @@ define <8 x i16> @ternary_A_or_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_or_BC_eqv_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 151
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -92,11 +84,9 @@ define <4 x i32> @ternary_A_nor_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 152
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -112,12 +102,10 @@ define <2 x i64> @ternary_A_nor_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 152
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -133,11 +121,9 @@ define <16 x i8> @ternary_A_nor_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 152
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -153,11 +139,9 @@ define <8 x i16> @ternary_A_nor_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_nor_BC_eqv_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 152
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -173,10 +157,9 @@ define <4 x i32> @ternary_A_not_C_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_not_C_eqv_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxeval v2, v2, vs0, v3, 99
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 154
; CHECK-NEXT: blr
entry:
%not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation
@@ -191,12 +174,10 @@ define <2 x i64> @ternary_A_not_C_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_not_C_eqv_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxleqv vs1, v4, v3
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 154
; CHECK-NEXT: blr
entry:
%not = xor <2 x i64> %C, <i64 -1, i64 -1> ; Vector not operation
@@ -211,11 +192,9 @@ define <16 x i8> @ternary_A_not_C_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_not_C_eqv_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxleqv vs1, v4, v3
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 154
; CHECK-NEXT: blr
entry:
%not = xor <16 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; Vector not operation
@@ -230,11 +209,9 @@ define <8 x i16> @ternary_A_not_C_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_not_C_eqv_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxleqv vs1, v4, v3
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 154
; CHECK-NEXT: blr
entry:
%not = xor <8 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ; Vector not operation
@@ -249,11 +226,9 @@ define <4 x i32> @ternary_A_nand_BC_eqv_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x
; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 158
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -269,12 +244,10 @@ define <2 x i64> @ternary_A_nand_BC_eqv_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x
; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 158
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -290,11 +263,9 @@ define <16 x i8> @ternary_A_nand_BC_eqv_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16
; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 158
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -310,11 +281,9 @@ define <8 x i16> @ternary_A_nand_BC_eqv_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x
; CHECK-LABEL: ternary_A_nand_BC_eqv_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxleqv vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 158
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll
index 7a6733d3..067b089 100644
--- a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nand.ll
@@ -15,10 +15,9 @@ define <4 x i32> @ternary_A_B_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32>
; CHECK-LABEL: ternary_A_B_nand_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 227
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -32,11 +31,10 @@ define <2 x i64> @ternary_A_B_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64>
; CHECK-LABEL: ternary_A_B_nand_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 227
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -50,10 +48,9 @@ define <16 x i8> @ternary_A_B_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8>
; CHECK-LABEL: ternary_A_B_nand_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 227
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -67,10 +64,9 @@ define <8 x i16> @ternary_A_B_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16>
; CHECK-LABEL: ternary_A_B_nand_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 227
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
@@ -84,10 +80,9 @@ define <4 x i32> @ternary_A_C_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32>
; CHECK-LABEL: ternary_A_C_nand_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 229
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -101,11 +96,10 @@ define <2 x i64> @ternary_A_C_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64>
; CHECK-LABEL: ternary_A_C_nand_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 229
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -119,10 +113,9 @@ define <16 x i8> @ternary_A_C_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8>
; CHECK-LABEL: ternary_A_C_nand_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 229
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -136,10 +129,9 @@ define <8 x i16> @ternary_A_C_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16>
; CHECK-LABEL: ternary_A_C_nand_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 229
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
@@ -153,11 +145,9 @@ define <4 x i32> @ternary_A_xor_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x
; CHECK-LABEL: ternary_A_xor_BC_nand_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 230
; CHECK-NEXT: blr
entry:
%xor = xor <4 x i32> %B, %C
@@ -172,12 +162,10 @@ define <2 x i64> @ternary_A_xor_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x
; CHECK-LABEL: ternary_A_xor_BC_nand_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 230
; CHECK-NEXT: blr
entry:
%xor = xor <2 x i64> %B, %C
@@ -192,11 +180,9 @@ define <16 x i8> @ternary_A_xor_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16
; CHECK-LABEL: ternary_A_xor_BC_nand_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 230
; CHECK-NEXT: blr
entry:
%xor = xor <16 x i8> %B, %C
@@ -211,11 +197,9 @@ define <8 x i16> @ternary_A_xor_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x
; CHECK-LABEL: ternary_A_xor_BC_nand_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 230
; CHECK-NEXT: blr
entry:
%xor = xor <8 x i16> %B, %C
@@ -230,11 +214,9 @@ define <4 x i32> @ternary_A_or_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_or_BC_nand_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 231
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -249,12 +231,10 @@ define <2 x i64> @ternary_A_or_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_or_BC_nand_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 231
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -269,11 +249,9 @@ define <16 x i8> @ternary_A_or_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_or_BC_nand_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 231
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -288,11 +266,9 @@ define <8 x i16> @ternary_A_or_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_or_BC_nand_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 231
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -307,11 +283,9 @@ define <4 x i32> @ternary_A_eqv_BC_nand_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x
; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxleqv vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 233
; CHECK-NEXT: blr
entry:
%xor = xor <4 x i32> %B, %C
@@ -327,12 +301,10 @@ define <2 x i64> @ternary_A_eqv_BC_nand_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x
; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxleqv vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 233
; CHECK-NEXT: blr
entry:
%xor = xor <2 x i64> %B, %C
@@ -348,11 +320,9 @@ define <16 x i8> @ternary_A_eqv_BC_nand_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16
; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxleqv vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 233
; CHECK-NEXT: blr
entry:
%xor = xor <16 x i8> %B, %C
@@ -368,11 +338,9 @@ define <8 x i16> @ternary_A_eqv_BC_nand_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x
; CHECK-LABEL: ternary_A_eqv_BC_nand_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxleqv vs0, v3, v4
-; CHECK-NEXT: xxlnand vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 233
; CHECK-NEXT: blr
entry:
%xor = xor <8 x i16> %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll
index d635952..3695874 100644
--- a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-nor.ll
@@ -15,11 +15,9 @@ define <4 x i32> @ternary_A_and_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_and_BC_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 129
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -34,12 +32,10 @@ define <2 x i64> @ternary_A_and_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_and_BC_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 129
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -54,11 +50,9 @@ define <16 x i8> @ternary_A_and_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_and_BC_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 129
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -73,11 +67,9 @@ define <8 x i16> @ternary_A_and_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_and_BC_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 129
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
@@ -92,10 +84,9 @@ define <4 x i32> @ternary_A_B_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %
; CHECK-LABEL: ternary_A_B_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 131
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -109,11 +100,10 @@ define <2 x i64> @ternary_A_B_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %
; CHECK-LABEL: ternary_A_B_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 131
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -127,10 +117,9 @@ define <16 x i8> @ternary_A_B_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8>
; CHECK-LABEL: ternary_A_B_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 131
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -144,10 +133,9 @@ define <8 x i16> @ternary_A_B_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %
; CHECK-LABEL: ternary_A_B_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 131
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -161,10 +149,9 @@ define <4 x i32> @ternary_A_C_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %
; CHECK-LABEL: ternary_A_C_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 133
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -178,11 +165,10 @@ define <2 x i64> @ternary_A_C_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %
; CHECK-LABEL: ternary_A_C_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 133
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -196,10 +182,9 @@ define <16 x i8> @ternary_A_C_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8>
; CHECK-LABEL: ternary_A_C_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 133
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -213,10 +198,9 @@ define <8 x i16> @ternary_A_C_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %
; CHECK-LABEL: ternary_A_C_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v4, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 133
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -230,11 +214,9 @@ define <4 x i32> @ternary_A_xor_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_xor_BC_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 134
; CHECK-NEXT: blr
entry:
%xor = xor <4 x i32> %B, %C
@@ -249,12 +231,10 @@ define <2 x i64> @ternary_A_xor_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_xor_BC_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 134
; CHECK-NEXT: blr
entry:
%xor = xor <2 x i64> %B, %C
@@ -269,11 +249,9 @@ define <16 x i8> @ternary_A_xor_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_xor_BC_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 134
; CHECK-NEXT: blr
entry:
%xor = xor <16 x i8> %B, %C
@@ -288,11 +266,9 @@ define <8 x i16> @ternary_A_xor_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_xor_BC_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 134
; CHECK-NEXT: blr
entry:
%xor = xor <8 x i16> %B, %C
@@ -307,11 +283,9 @@ define <4 x i32> @ternary_A_not_C_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_not_C_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 138
; CHECK-NEXT: blr
entry:
%not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation
@@ -326,12 +300,10 @@ define <2 x i64> @ternary_A_not_C_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_not_C_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 138
; CHECK-NEXT: blr
entry:
%not = xor <2 x i64> %C, <i64 -1, i64 -1> ; Vector not operation
@@ -346,11 +318,9 @@ define <16 x i8> @ternary_A_not_C_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_not_C_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 138
; CHECK-NEXT: blr
entry:
%not = xor <16 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; Vector not operation
@@ -365,11 +335,9 @@ define <8 x i16> @ternary_A_not_C_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_not_C_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v4, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 138
; CHECK-NEXT: blr
entry:
%not = xor <8 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ; Vector not operation
@@ -384,11 +352,9 @@ define <4 x i32> @ternary_A_not_B_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_not_B_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 140
; CHECK-NEXT: blr
entry:
%not = xor <4 x i32> %B, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation
@@ -403,12 +369,10 @@ define <2 x i64> @ternary_A_not_B_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_not_B_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 140
; CHECK-NEXT: blr
entry:
%not = xor <2 x i64> %B, <i64 -1, i64 -1> ; Vector not operation
@@ -423,11 +387,9 @@ define <16 x i8> @ternary_A_not_B_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_not_B_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 140
; CHECK-NEXT: blr
entry:
%not = xor <16 x i8> %B, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; Vector not operation
@@ -442,11 +404,9 @@ define <8 x i16> @ternary_A_not_B_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_not_B_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 140
; CHECK-NEXT: blr
entry:
%not = xor <8 x i16> %B, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ; Vector not operation
@@ -461,11 +421,9 @@ define <4 x i32> @ternary_A_nand_BC_nor_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x
; CHECK-LABEL: ternary_A_nand_BC_nor_BC_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 142
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -481,12 +439,10 @@ define <2 x i64> @ternary_A_nand_BC_nor_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x
; CHECK-LABEL: ternary_A_nand_BC_nor_BC_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 142
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -502,11 +458,9 @@ define <16 x i8> @ternary_A_nand_BC_nor_BC_16x8(<16 x i1> %A, <16 x i8> %B, <16
; CHECK-LABEL: ternary_A_nand_BC_nor_BC_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 142
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -522,11 +476,9 @@ define <8 x i16> @ternary_A_nand_BC_nor_BC_8x16(<8 x i1> %A, <8 x i16> %B, <8 x
; CHECK-LABEL: ternary_A_nand_BC_nor_BC_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 142
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll
index 6203a96..a67d9cf 100644
--- a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-b.ll
@@ -15,11 +15,9 @@ define <4 x i32> @ternary_A_and_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_and_BC_not_B_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 193
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -33,12 +31,10 @@ define <2 x i64> @ternary_A_and_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_and_BC_not_B_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 193
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -52,11 +48,9 @@ define <16 x i8> @ternary_A_and_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_and_BC_not_B_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 193
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -70,11 +64,9 @@ define <8 x i16> @ternary_A_and_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_and_BC_not_B_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 193
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
@@ -88,11 +80,9 @@ define <4 x i32> @ternary_A_xor_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_xor_BC_not_B_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 198
; CHECK-NEXT: blr
entry:
%xor = xor <4 x i32> %B, %C
@@ -106,12 +96,10 @@ define <2 x i64> @ternary_A_xor_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_xor_BC_not_B_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 198
; CHECK-NEXT: blr
entry:
%xor = xor <2 x i64> %B, %C
@@ -125,11 +113,9 @@ define <16 x i8> @ternary_A_xor_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_xor_BC_not_B_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 198
; CHECK-NEXT: blr
entry:
%xor = xor <16 x i8> %B, %C
@@ -143,11 +129,9 @@ define <8 x i16> @ternary_A_xor_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_xor_BC_not_B_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 198
; CHECK-NEXT: blr
entry:
%xor = xor <8 x i16> %B, %C
@@ -161,11 +145,9 @@ define <4 x i32> @ternary_A_or_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32
; CHECK-LABEL: ternary_A_or_BC_not_B_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 199
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -179,12 +161,10 @@ define <2 x i64> @ternary_A_or_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64
; CHECK-LABEL: ternary_A_or_BC_not_B_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 199
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -198,11 +178,9 @@ define <16 x i8> @ternary_A_or_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i
; CHECK-LABEL: ternary_A_or_BC_not_B_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 199
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -216,11 +194,9 @@ define <8 x i16> @ternary_A_or_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16
; CHECK-LABEL: ternary_A_or_BC_not_B_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 199
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -234,11 +210,9 @@ define <4 x i32> @ternary_A_nand_BC_not_B_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_nand_BC_not_B_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 206
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -253,12 +227,10 @@ define <2 x i64> @ternary_A_nand_BC_not_B_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_nand_BC_not_B_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 206
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -273,11 +245,9 @@ define <16 x i8> @ternary_A_nand_BC_not_B_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_nand_BC_not_B_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 206
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -292,11 +262,9 @@ define <8 x i16> @ternary_A_nand_BC_not_B_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_nand_BC_not_B_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v3, v3
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 206
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll
index 3479d94..98c1f28 100644
--- a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-not-c.ll
@@ -15,11 +15,9 @@ define <4 x i32> @ternary_A_and_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_and_BC_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 161
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -33,12 +31,10 @@ define <2 x i64> @ternary_A_and_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_and_BC_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 161
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -52,11 +48,9 @@ define <16 x i8> @ternary_A_and_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_and_BC_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 161
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -70,11 +64,9 @@ define <8 x i16> @ternary_A_and_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_and_BC_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxland vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 161
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
@@ -88,10 +80,9 @@ define <4 x i32> @ternary_A_B_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C
; CHECK-LABEL: ternary_A_B_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 163
; CHECK-NEXT: blr
entry:
%not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation
@@ -104,11 +95,10 @@ define <2 x i64> @ternary_A_B_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C
; CHECK-LABEL: ternary_A_B_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 163
; CHECK-NEXT: blr
entry:
%not = xor <2 x i64> %C, <i64 -1, i64 -1> ; Vector not operation
@@ -121,10 +111,9 @@ define <16 x i8> @ternary_A_B_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i8> %
; CHECK-LABEL: ternary_A_B_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 163
; CHECK-NEXT: blr
entry:
%not = xor <16 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; Vector not operation
@@ -137,10 +126,9 @@ define <8 x i16> @ternary_A_B_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16> %C
; CHECK-LABEL: ternary_A_B_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs0, v3, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 163
; CHECK-NEXT: blr
entry:
%not = xor <8 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ; Vector not operation
@@ -153,11 +141,9 @@ define <4 x i32> @ternary_A_xor_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i3
; CHECK-LABEL: ternary_A_xor_BC_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 166
; CHECK-NEXT: blr
entry:
%xor = xor <4 x i32> %B, %C
@@ -171,12 +157,10 @@ define <2 x i64> @ternary_A_xor_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i6
; CHECK-LABEL: ternary_A_xor_BC_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 166
; CHECK-NEXT: blr
entry:
%xor = xor <2 x i64> %B, %C
@@ -190,11 +174,9 @@ define <16 x i8> @ternary_A_xor_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_xor_BC_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 166
; CHECK-NEXT: blr
entry:
%xor = xor <16 x i8> %B, %C
@@ -208,11 +190,9 @@ define <8 x i16> @ternary_A_xor_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i1
; CHECK-LABEL: ternary_A_xor_BC_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlxor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 166
; CHECK-NEXT: blr
entry:
%xor = xor <8 x i16> %B, %C
@@ -226,11 +206,9 @@ define <4 x i32> @ternary_A_or_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32
; CHECK-LABEL: ternary_A_or_BC_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 167
; CHECK-NEXT: blr
entry:
%or = or <4 x i32> %B, %C
@@ -244,12 +222,10 @@ define <2 x i64> @ternary_A_or_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64
; CHECK-LABEL: ternary_A_or_BC_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 167
; CHECK-NEXT: blr
entry:
%or = or <2 x i64> %B, %C
@@ -263,11 +239,9 @@ define <16 x i8> @ternary_A_or_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i
; CHECK-LABEL: ternary_A_or_BC_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 167
; CHECK-NEXT: blr
entry:
%or = or <16 x i8> %B, %C
@@ -281,11 +255,9 @@ define <8 x i16> @ternary_A_or_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16
; CHECK-LABEL: ternary_A_or_BC_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlor vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 167
; CHECK-NEXT: blr
entry:
%or = or <8 x i16> %B, %C
@@ -299,11 +271,9 @@ define <4 x i32> @ternary_A_not_B_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32
; CHECK-LABEL: ternary_A_not_B_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 172
; CHECK-NEXT: blr
entry:
%not_b = xor <4 x i32> %B, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation
@@ -317,12 +287,10 @@ define <2 x i64> @ternary_A_not_B_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64
; CHECK-LABEL: ternary_A_not_B_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 172
; CHECK-NEXT: blr
entry:
%not_b = xor <2 x i64> %B, <i64 -1, i64 -1> ; Vector not operation
@@ -336,11 +304,9 @@ define <16 x i8> @ternary_A_not_B_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x i
; CHECK-LABEL: ternary_A_not_B_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 172
; CHECK-NEXT: blr
entry:
%not_b = xor <16 x i8> %B, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; Vector not operation
@@ -354,11 +320,9 @@ define <8 x i16> @ternary_A_not_B_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i16
; CHECK-LABEL: ternary_A_not_B_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnor vs0, v3, v3
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 172
; CHECK-NEXT: blr
entry:
%not_b = xor <8 x i16> %B, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> ; Vector not operation
@@ -372,11 +336,9 @@ define <4 x i32> @ternary_A_nand_BC_not_C_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i
; CHECK-LABEL: ternary_A_nand_BC_not_C_4x32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxleqv v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslw v2, v2, v5
; CHECK-NEXT: vsraw v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 174
; CHECK-NEXT: blr
entry:
%and = and <4 x i32> %B, %C
@@ -391,12 +353,10 @@ define <2 x i64> @ternary_A_nand_BC_not_C_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i
; CHECK-LABEL: ternary_A_nand_BC_not_C_2x64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxlxor v5, v5, v5
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: xxsplti32dx v5, 1, 63
; CHECK-NEXT: vsld v2, v2, v5
; CHECK-NEXT: vsrad v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 174
; CHECK-NEXT: blr
entry:
%and = and <2 x i64> %B, %C
@@ -411,11 +371,9 @@ define <16 x i8> @ternary_A_nand_BC_not_C_16x8(<16 x i1> %A, <16 x i8> %B, <16 x
; CHECK-LABEL: ternary_A_nand_BC_not_C_16x8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltib v5, 7
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslb v2, v2, v5
; CHECK-NEXT: vsrab v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 174
; CHECK-NEXT: blr
entry:
%and = and <16 x i8> %B, %C
@@ -430,11 +388,9 @@ define <8 x i16> @ternary_A_nand_BC_not_C_8x16(<8 x i1> %A, <8 x i16> %B, <8 x i
; CHECK-LABEL: ternary_A_nand_BC_not_C_8x16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxspltiw v5, 983055
-; CHECK-NEXT: xxlnand vs0, v3, v4
-; CHECK-NEXT: xxlnor vs1, v4, v4
; CHECK-NEXT: vslh v2, v2, v5
; CHECK-NEXT: vsrah v2, v2, v5
-; CHECK-NEXT: xxsel v2, vs1, vs0, v2
+; CHECK-NEXT: xxeval v2, v2, v3, v4, 174
; CHECK-NEXT: blr
entry:
%and = and <8 x i16> %B, %C
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index 2e500d5..da7546e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -689,8 +689,8 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir
new file mode 100644
index 0000000..d7c0e80
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv32.mir
@@ -0,0 +1,1742 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: insertelement_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv1i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s32) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s32)
+ %1:_(s32) = COPY $x11
+ %4:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %3:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+ $v0 = COPY %3(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s32) = G_CONSTANT i32 1
+ %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv2i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s32) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s32)
+ %1:_(s32) = COPY $x11
+ %4:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %3:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+ $v0 = COPY %3(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s32) = G_CONSTANT i32 2
+ %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(s32) = COPY $x10
+ %0:_(s1) = G_TRUNC %1(s32)
+ %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s32)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv8i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s32) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s32)
+ %1:_(s32) = COPY $x11
+ %4:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %3:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+ $v0 = COPY %3(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s32) = G_CONSTANT i32 15
+ %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s32)
+ $v0 = COPY %0(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv16i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[COPY1]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s32) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s32)
+ %1:_(s32) = COPY $x11
+ %4:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %3:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %4, %0(s1), %1(s32)
+ $v0 = COPY %3(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_3
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i1_3
+ ; CHECK: liveins: $v0, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s32)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(<vscale x 4 x s1>) = COPY $v0
+ %2:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %2(s32)
+ %4:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s32)
+ $v0 = COPY %3(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s32)
+ %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s32)
+ %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s32)
+ %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s32)
+ %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s32)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv16i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s32)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11, $x12
+
+ ; CHECK-LABEL: name: insertelement_nxv16i8_2
+ ; CHECK: liveins: $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %2:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s32)
+ %3:_(s32) = COPY $x11
+ %4:_(s32) = COPY $x12
+ %1:_(s64) = G_MERGE_VALUES %3(s32), %4(s32)
+ %6:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %7:_(s32) = G_TRUNC %1(s64)
+ %5:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %6, %0(s8), %7(s32)
+ $v8m2 = COPY %5(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i8_3
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i8_3
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %2:_(s32) = COPY $x10
+ %1:_(s8) = G_TRUNC %2(s32)
+ %4:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s32)
+ $v8 = COPY %3(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s32)
+ %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s32) = G_CONSTANT i32 1
+ %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s32)
+ %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s32)
+ %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s32) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s32)
+ %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s32)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv16i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s32) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s32)
+ %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %4:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s32)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %2:_(s32) = COPY $x10
+ %1:_(s16) = G_TRUNC %2(s32)
+ %4:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s32)
+ $v8 = COPY %3(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(s32) = COPY $x10
+ %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %3:_(s32) = G_CONSTANT i32 0
+ %1:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(s32) = COPY $x10
+ %2:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %3:_(s32) = G_CONSTANT i32 0
+ %1:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(s32) = COPY $x10
+ %2:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %3:_(s32) = G_CONSTANT i32 0
+ %1:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(s32) = COPY $x10
+ %2:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %3:_(s32) = G_CONSTANT i32 0
+ %1:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %2(s32)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv16i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s32), [[C1]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s32)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv16i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv16i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(s32) = COPY $x10
+ %2:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %3:_(s32) = G_CONSTANT i32 0
+ %1:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %2, %0(s32), %3(s32)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: insertelement_nxv4i32
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(s32) = COPY $x10
+ %3:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %3(s32)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv1i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv1i64_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = COPY $x10
+ %2:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %5:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+ $v8 = COPY %3(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv2i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv2i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv2i64_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s32) = COPY $x10
+ %2:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %5:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+ $v8m2 = COPY %3(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv4i64_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s32) = COPY $x10
+ %2:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %5:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+ $v8m4 = COPY %3(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv8i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C1]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s32) = G_CONSTANT i32 0
+ %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s32)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv8i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv8i64_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(s32) = COPY $x10
+ %2:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %1(s32), %2(s32)
+ %4:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %5:_(s32) = G_CONSTANT i32 0
+ %3:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %4, %0(s64), %5(s32)
+ $v8m8 = COPY %3(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv4i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11, $v8m4
+
+ ; CHECK-LABEL: name: insertelement_nxv4i64
+ ; CHECK: liveins: $x10, $x11, $v8m4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[MV]](s64), [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s64>) = COPY $v8m4
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %1:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %5:_(s32) = G_CONSTANT i32 0
+ %4:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %0, %1(s64), %5(s32)
+ $v8m4 = COPY %4(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir
new file mode 100644
index 0000000..4c33ddc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insertelement-rv64.mir
@@ -0,0 +1,1731 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: insertelement_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv1i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s64) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %5:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %6:_(s64) = G_ZEXT %1(s32)
+ %4:_(<vscale x 1 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+ $v0 = COPY %4(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s64) = G_CONSTANT i64 1
+ %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv2i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s64) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %5:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %6:_(s64) = G_ZEXT %1(s32)
+ %4:_(<vscale x 2 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+ $v0 = COPY %4(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s64) = G_CONSTANT i64 2
+ %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(s64) = COPY $x10
+ %0:_(s1) = G_TRUNC %1(s64)
+ %3:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %3, %0(s1), %4(s64)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv8i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s64) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %5:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %6:_(s64) = G_ZEXT %1(s32)
+ %4:_(<vscale x 8 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+ $v0 = COPY %4(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i1_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 false
+ %3:_(s64) = G_CONSTANT i64 15
+ %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i1_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C1]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(s1) = G_CONSTANT i1 true
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %1, %2(s1), %3(s64)
+ $v0 = COPY %0(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv16i1_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[AND]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %2:_(s64) = COPY $x10
+ %0:_(s1) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %5:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %6:_(s64) = G_ZEXT %1(s32)
+ %4:_(<vscale x 16 x s1>) = G_INSERT_VECTOR_ELT %5, %0(s1), %6(s64)
+ $v0 = COPY %4(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv4i1_3
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i1_3
+ ; CHECK: liveins: $v0, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s1), [[C]](s64)
+ ; CHECK-NEXT: $v0 = COPY [[IVEC]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(<vscale x 4 x s1>) = COPY $v0
+ %2:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %2(s64)
+ %4:_(s64) = G_CONSTANT i64 0
+ %3:_(<vscale x 4 x s1>) = G_INSERT_VECTOR_ELT %0, %1(s1), %4(s64)
+ $v0 = COPY %3(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: insertelement_nxv1i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s64)
+ %3:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 1 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s64)
+ %3:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 2 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s64)
+ %3:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i8_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %1(s64)
+ %3:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 8 x s8>) = G_INSERT_VECTOR_ELT %3, %0(s8), %4(s64)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv16i8_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i8_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i8_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %2:_(s8) = G_CONSTANT i8 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %1, %2(s8), %3(s64)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i8_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: insertelement_nxv16i8_2
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s8), [[COPY1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %2:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s64)
+ %1:_(s64) = COPY $x11
+ %4:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %3:_(<vscale x 16 x s8>) = G_INSERT_VECTOR_ELT %4, %0(s8), %1(s64)
+ $v8m2 = COPY %3(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i8_3
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i8_3
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s8), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %2:_(s64) = COPY $x10
+ %1:_(s8) = G_TRUNC %2(s64)
+ %4:_(s64) = G_CONSTANT i64 0
+ %3:_(<vscale x 4 x s8>) = G_INSERT_VECTOR_ELT %0, %1(s8), %4(s64)
+ $v8 = COPY %3(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s64)
+ %3:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 1 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s64) = G_CONSTANT i64 1
+ %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s64)
+ %3:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 2 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s64)
+ %3:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv8i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s64) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s64)
+ %3:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 8 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv16i16_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i16_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i16_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %2:_(s16) = G_CONSTANT i16 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %1, %2(s16), %3(s64)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i16_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv16i16_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s64) = COPY $x10
+ %0:_(s16) = G_TRUNC %1(s64)
+ %3:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 16 x s16>) = G_INSERT_VECTOR_ELT %3, %0(s16), %4(s64)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s16), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %2:_(s64) = COPY $x10
+ %1:_(s16) = G_TRUNC %2(s64)
+ %4:_(s64) = G_CONSTANT i64 0
+ %3:_(<vscale x 4 x s16>) = G_INSERT_VECTOR_ELT %0, %1(s16), %4(s64)
+ $v8 = COPY %3(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %1(s64)
+ %3:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 1 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+ $v8 = COPY %2(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %1(s64)
+ %3:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 2 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+ $v8 = COPY %2(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv4i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %1(s64)
+ %3:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv8i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %1(s64)
+ %3:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 8 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+ $v8m4 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv16i32_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i32_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 0
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv16i32_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv16i32_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %2:_(s32) = G_CONSTANT i32 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %1, %2(s32), %3(s64)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv16i32_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv16i32_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %1(s64)
+ %3:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %4:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 16 x s32>) = G_INSERT_VECTOR_ELT %3, %0(s32), %4(s64)
+ $v8m8 = COPY %2(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: insertelement_nxv4i32
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[TRUNC]](s32), [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %2:_(s64) = COPY $x10
+ %1:_(s32) = G_TRUNC %2(s64)
+ %4:_(s64) = G_CONSTANT i64 0
+ %3:_(<vscale x 4 x s32>) = G_INSERT_VECTOR_ELT %0, %1(s32), %4(s64)
+ $v8m2 = COPY %3(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv1i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv1i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv1i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv1i64_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[IVEC]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(s64) = COPY $x10
+ %2:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %3:_(s64) = G_CONSTANT i64 0
+ %1:_(<vscale x 1 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: insertelement_nxv2i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv2i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv2i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv2i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv2i64_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[IVEC]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(s64) = COPY $x10
+ %2:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %3:_(s64) = G_CONSTANT i64 0
+ %1:_(<vscale x 2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: insertelement_nxv4i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv4i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv4i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv4i64_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[IVEC]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(s64) = COPY $x10
+ %2:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %3:_(s64) = G_CONSTANT i64 0
+ %1:_(<vscale x 4 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: insertelement_nxv8i64_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i64_0
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %2(s64)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv8i64_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insertelement_nxv8i64_1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[C]](s64), [[C1]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %2:_(s64) = G_CONSTANT i64 -1
+ %3:_(s64) = G_CONSTANT i64 0
+ %0:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %1, %2(s64), %3(s64)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: insertelement_nxv8i64_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: insertelement_nxv8i64_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s64), [[C]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[IVEC]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(s64) = COPY $x10
+ %2:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %3:_(s64) = G_CONSTANT i64 0
+ %1:_(<vscale x 8 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s64)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
index 2a46a59..4f036d3 100644
--- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
+++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
@@ -221,8 +221,8 @@ define i64 @test12(i64 %0) #0 {
;
; RV64-LABEL: test12:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: addiw a0, a0, -16
-; RV64-NEXT: addi a0, a0, 13
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addiw a0, a0, 13
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index f3529b1..22c2d81 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -80,6 +80,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+xwchc %s -o - | FileCheck --check-prefix=RV32XWCHC %s
; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s
; RUN: llc -mtriple=riscv32 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s
+; RUN: llc -mtriple=riscv32 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV32COMBINEINTOA %s
; RUN: llc -mtriple=riscv32 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCA %s
; RUN: llc -mtriple=riscv32 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCB %s
; RUN: llc -mtriple=riscv32 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV32ZCD %s
@@ -227,6 +228,7 @@
; RUN: llc -mtriple=riscv64 -mattr=+ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s
; RUN: llc -mtriple=riscv64 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV64ZAAMO %s
; RUN: llc -mtriple=riscv64 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV64ZALRSC %s
+; RUN: llc -mtriple=riscv64 -mattr=+zaamo,+zalrsc %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOA %s
; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s
; RUN: llc -mtriple=riscv64 -mattr=+zcb %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCB %s
; RUN: llc -mtriple=riscv64 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCD %s
@@ -392,6 +394,7 @@
; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2"
; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0"
; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc1p0"
+; RV32COMBINEINTOA: .attribute 5, "rv32i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV32ZCA: .attribute 5, "rv32i2p1_zca1p0"
; RV32ZCB: .attribute 5, "rv32i2p1_zca1p0_zcb1p0"
; RV32ZCD: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
@@ -537,6 +540,7 @@
; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0"
; RV64ZAAMO: .attribute 5, "rv64i2p1_zaamo1p0"
; RV64ZALRSC: .attribute 5, "rv64i2p1_zalrsc1p0"
+; RV64COMBINEINTOA: .attribute 5, "rv64i2p1_a2p1_zaamo1p0_zalrsc1p0"
; RV64ZCA: .attribute 5, "rv64i2p1_zca1p0"
; RV64ZCB: .attribute 5, "rv64i2p1_zca1p0_zcb1p0"
; RV64ZCD: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0"
diff --git a/llvm/test/CodeGen/RISCV/branch-rel.mir b/llvm/test/CodeGen/RISCV/branch-rel.mir
new file mode 100644
index 0000000..1ed5f57
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/branch-rel.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=riscv64 -run-pass=branch-relaxation -o - -verify-machineinstrs | FileCheck %s
+
+--- |
+ define void @foo() {
+ ret void
+ }
+...
+---
+name: foo
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: foo
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PseudoBR %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: INLINEASM &".space 4096", 1 /* sideeffect attdialect */
+ ; CHECK-NEXT: BGE $x1, $x0, %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: PseudoRET
+ bb.0:
+ liveins: $x1
+ BNE $x1, $x0, %bb.3
+ PseudoBR %bb.3
+ bb.1:
+ liveins: $x1
+ INLINEASM &".space 4096", 1
+ BGE $x1, $x0, %bb.3
+ bb.3:
+ PseudoRET
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
diff --git a/llvm/test/CodeGen/RISCV/div_minsize.ll b/llvm/test/CodeGen/RISCV/div_minsize.ll
index 601821b..794af2f 100644
--- a/llvm/test/CodeGen/RISCV/div_minsize.ll
+++ b/llvm/test/CodeGen/RISCV/div_minsize.ll
@@ -68,3 +68,151 @@ define i32 @testsize4(i32 %x) minsize nounwind {
%div = udiv i32 %x, 33
ret i32 %div
}
+
+define i128 @i128_sdiv(i128 %arg0) minsize nounwind {
+; RV32IM-LABEL: i128_sdiv:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lw a2, 12(a1)
+; RV32IM-NEXT: lw a3, 8(a1)
+; RV32IM-NEXT: lw a4, 0(a1)
+; RV32IM-NEXT: lw a1, 4(a1)
+; RV32IM-NEXT: srai a5, a2, 31
+; RV32IM-NEXT: srli a5, a5, 30
+; RV32IM-NEXT: add a5, a4, a5
+; RV32IM-NEXT: sltu a4, a5, a4
+; RV32IM-NEXT: srli a5, a5, 2
+; RV32IM-NEXT: add a6, a1, a4
+; RV32IM-NEXT: sltu a1, a6, a1
+; RV32IM-NEXT: and a1, a4, a1
+; RV32IM-NEXT: srli a4, a6, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: or a5, a5, a6
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: srli a6, a1, 2
+; RV32IM-NEXT: sltu a3, a1, a3
+; RV32IM-NEXT: slli a1, a1, 30
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: or a1, a4, a1
+; RV32IM-NEXT: slli a3, a2, 30
+; RV32IM-NEXT: srai a2, a2, 2
+; RV32IM-NEXT: or a3, a6, a3
+; RV32IM-NEXT: sw a5, 0(a0)
+; RV32IM-NEXT: sw a1, 4(a0)
+; RV32IM-NEXT: sw a3, 8(a0)
+; RV32IM-NEXT: sw a2, 12(a0)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: i128_sdiv:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -16
+; RV64IM-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: li a2, 4
+; RV64IM-NEXT: li a3, 0
+; RV64IM-NEXT: call __divti3
+; RV64IM-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 16
+; RV64IM-NEXT: ret
+ %div = sdiv i128 %arg0, 4
+ ret i128 %div
+}
+
+define i256 @i256_sdiv(i256 %arg0) minsize nounwind {
+; RV32IM-LABEL: i256_sdiv:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lw a5, 16(a1)
+; RV32IM-NEXT: lw a4, 20(a1)
+; RV32IM-NEXT: lw a2, 24(a1)
+; RV32IM-NEXT: lw a3, 28(a1)
+; RV32IM-NEXT: lw a6, 0(a1)
+; RV32IM-NEXT: lw a7, 4(a1)
+; RV32IM-NEXT: lw t0, 8(a1)
+; RV32IM-NEXT: lw t1, 12(a1)
+; RV32IM-NEXT: srai a1, a3, 31
+; RV32IM-NEXT: srli a1, a1, 30
+; RV32IM-NEXT: add a1, a6, a1
+; RV32IM-NEXT: sltu t2, a1, a6
+; RV32IM-NEXT: add a6, a7, t2
+; RV32IM-NEXT: sltu a7, a6, a7
+; RV32IM-NEXT: and t2, t2, a7
+; RV32IM-NEXT: add a7, t0, t2
+; RV32IM-NEXT: sltu t3, a7, t0
+; RV32IM-NEXT: add t0, t1, t3
+; RV32IM-NEXT: beqz t2, .LBB5_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: sltu t1, t0, t1
+; RV32IM-NEXT: and t2, t3, t1
+; RV32IM-NEXT: .LBB5_2:
+; RV32IM-NEXT: add t2, a5, t2
+; RV32IM-NEXT: srli t1, t0, 2
+; RV32IM-NEXT: srli t3, a7, 2
+; RV32IM-NEXT: slli t0, t0, 30
+; RV32IM-NEXT: slli a7, a7, 30
+; RV32IM-NEXT: or t0, t3, t0
+; RV32IM-NEXT: srli t3, a6, 2
+; RV32IM-NEXT: srli a1, a1, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: sltu a5, t2, a5
+; RV32IM-NEXT: or a7, t3, a7
+; RV32IM-NEXT: srli t3, t2, 2
+; RV32IM-NEXT: slli t2, t2, 30
+; RV32IM-NEXT: or a1, a1, a6
+; RV32IM-NEXT: add a6, a4, a5
+; RV32IM-NEXT: or t1, t1, t2
+; RV32IM-NEXT: sltu a4, a6, a4
+; RV32IM-NEXT: srli t2, a6, 2
+; RV32IM-NEXT: slli a6, a6, 30
+; RV32IM-NEXT: sw a1, 0(a0)
+; RV32IM-NEXT: sw a7, 4(a0)
+; RV32IM-NEXT: sw t0, 8(a0)
+; RV32IM-NEXT: sw t1, 12(a0)
+; RV32IM-NEXT: and a4, a5, a4
+; RV32IM-NEXT: or a1, t3, a6
+; RV32IM-NEXT: add a4, a2, a4
+; RV32IM-NEXT: srli a5, a4, 2
+; RV32IM-NEXT: sltu a2, a4, a2
+; RV32IM-NEXT: slli a4, a4, 30
+; RV32IM-NEXT: add a2, a3, a2
+; RV32IM-NEXT: or a3, t2, a4
+; RV32IM-NEXT: slli a4, a2, 30
+; RV32IM-NEXT: srai a2, a2, 2
+; RV32IM-NEXT: or a4, a5, a4
+; RV32IM-NEXT: sw a1, 16(a0)
+; RV32IM-NEXT: sw a3, 20(a0)
+; RV32IM-NEXT: sw a4, 24(a0)
+; RV32IM-NEXT: sw a2, 28(a0)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: i256_sdiv:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: ld a2, 24(a1)
+; RV64IM-NEXT: ld a3, 16(a1)
+; RV64IM-NEXT: ld a4, 0(a1)
+; RV64IM-NEXT: ld a1, 8(a1)
+; RV64IM-NEXT: srai a5, a2, 63
+; RV64IM-NEXT: srli a5, a5, 62
+; RV64IM-NEXT: add a5, a4, a5
+; RV64IM-NEXT: sltu a4, a5, a4
+; RV64IM-NEXT: srli a5, a5, 2
+; RV64IM-NEXT: add a6, a1, a4
+; RV64IM-NEXT: sltu a1, a6, a1
+; RV64IM-NEXT: and a1, a4, a1
+; RV64IM-NEXT: srli a4, a6, 2
+; RV64IM-NEXT: slli a6, a6, 62
+; RV64IM-NEXT: or a5, a5, a6
+; RV64IM-NEXT: add a1, a3, a1
+; RV64IM-NEXT: srli a6, a1, 2
+; RV64IM-NEXT: sltu a3, a1, a3
+; RV64IM-NEXT: slli a1, a1, 62
+; RV64IM-NEXT: add a2, a2, a3
+; RV64IM-NEXT: or a1, a4, a1
+; RV64IM-NEXT: slli a3, a2, 62
+; RV64IM-NEXT: srai a2, a2, 2
+; RV64IM-NEXT: or a3, a6, a3
+; RV64IM-NEXT: sd a5, 0(a0)
+; RV64IM-NEXT: sd a1, 8(a0)
+; RV64IM-NEXT: sd a3, 16(a0)
+; RV64IM-NEXT: sd a2, 24(a0)
+; RV64IM-NEXT: ret
+ %div = sdiv i256 %arg0, 4
+ ret i256 %div
+}
diff --git a/llvm/test/CodeGen/RISCV/i64-icmp.ll b/llvm/test/CodeGen/RISCV/i64-icmp.ll
index 88d989d..2742b9a 100644
--- a/llvm/test/CodeGen/RISCV/i64-icmp.ll
+++ b/llvm/test/CodeGen/RISCV/i64-icmp.ll
@@ -708,8 +708,7 @@ define i64 @icmp_sle_constant_neg_2050(i64 %a) nounwind {
define i64 @icmp_eq_zext_inreg_small_constant(i64 %a) nounwind {
; RV64I-LABEL: icmp_eq_zext_inreg_small_constant:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -123
+; RV64I-NEXT: addiw a0, a0, -123
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
%1 = and i64 %a, 4294967295
@@ -748,8 +747,7 @@ define i64 @icmp_ne_zext_inreg_small_constant(i64 %a) nounwind {
define i64 @icmp_ne_zext_inreg_large_constant(i64 %a) nounwind {
; RV64I-LABEL: icmp_ne_zext_inreg_large_constant:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, 2
+; RV64I-NEXT: addiw a0, a0, 2
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: ret
%1 = and i64 %a, 4294967295
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index 9937627..d7b00f6 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -1,16 +1,2315 @@
-; RUN: llc -mtriple=riscv32 < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 < %s | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 < %s | FileCheck %s --check-prefix=RV64
+
+define i64 @udiv_i64(i64 %x, i64 %y) nounwind {
+; RV32-LABEL: udiv_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call __udivdi3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: tail __udivdi3
+ %res = udiv i64 %x, %y
+ ret i64 %res
+}
+
+define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
+; RV32-LABEL: udiv_i65:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw t1, 8(a2)
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: lui a5, 209715
+; RV32-NEXT: lui a6, 61681
+; RV32-NEXT: addi t0, a2, 1365
+; RV32-NEXT: addi a7, a5, 819
+; RV32-NEXT: addi a6, a6, -241
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli t3, a4, 31
+; RV32-NEXT: or t2, a5, a2
+; RV32-NEXT: srli a2, a3, 1
+; RV32-NEXT: or t4, a2, t3
+; RV32-NEXT: bnez t2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a2, t4, 1
+; RV32-NEXT: or a2, t4, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli a2, a2, 24
+; RV32-NEXT: addi t3, a2, 32
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: srli a2, t2, 1
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli t3, a2, 24
+; RV32-NEXT: .LBB1_3: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -96
+; RV32-NEXT: sw s0, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a2, a3, 31
+; RV32-NEXT: li t5, 64
+; RV32-NEXT: bnez a2, .LBB1_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li s0, 64
+; RV32-NEXT: j .LBB1_6
+; RV32-NEXT: .LBB1_5:
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 2
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 8
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: srli a5, a2, 16
+; RV32-NEXT: or a2, a2, a5
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: srli a5, a2, 1
+; RV32-NEXT: and a5, a5, t0
+; RV32-NEXT: sub a2, a2, a5
+; RV32-NEXT: and a5, a2, a7
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and a2, a2, a7
+; RV32-NEXT: add a2, a5, a2
+; RV32-NEXT: srli a5, a2, 4
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: and a2, a2, a6
+; RV32-NEXT: slli a5, a2, 8
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: slli a5, a2, 16
+; RV32-NEXT: add a2, a2, a5
+; RV32-NEXT: srli s0, a2, 24
+; RV32-NEXT: .LBB1_6: # %_udiv-special-cases
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw s2, 8(a1)
+; RV32-NEXT: or a1, t4, t2
+; RV32-NEXT: addi s1, s0, 64
+; RV32-NEXT: bnez a1, .LBB1_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t3, s1
+; RV32-NEXT: .LBB1_8: # %_udiv-special-cases
+; RV32-NEXT: snez s4, a1
+; RV32-NEXT: srli a1, a2, 1
+; RV32-NEXT: slli t2, s2, 31
+; RV32-NEXT: slli t4, a2, 31
+; RV32-NEXT: or a1, t2, a1
+; RV32-NEXT: srli t2, a5, 1
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: bnez a1, .LBB1_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli t2, t6, 1
+; RV32-NEXT: or t2, t6, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli t2, t2, 24
+; RV32-NEXT: addi s3, t2, 32
+; RV32-NEXT: j .LBB1_11
+; RV32-NEXT: .LBB1_10:
+; RV32-NEXT: srli t2, a1, 1
+; RV32-NEXT: or t2, a1, t2
+; RV32-NEXT: srli t4, t2, 2
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 8
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: srli t4, t2, 16
+; RV32-NEXT: or t2, t2, t4
+; RV32-NEXT: not t2, t2
+; RV32-NEXT: srli t4, t2, 1
+; RV32-NEXT: and t4, t4, t0
+; RV32-NEXT: sub t2, t2, t4
+; RV32-NEXT: and t4, t2, a7
+; RV32-NEXT: srli t2, t2, 2
+; RV32-NEXT: and t2, t2, a7
+; RV32-NEXT: add t2, t4, t2
+; RV32-NEXT: srli t4, t2, 4
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: and t2, t2, a6
+; RV32-NEXT: slli t4, t2, 8
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: slli t4, t2, 16
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: srli s3, t2, 24
+; RV32-NEXT: .LBB1_11: # %_udiv-special-cases
+; RV32-NEXT: andi t4, s2, 1
+; RV32-NEXT: andi t1, t1, 1
+; RV32-NEXT: or t2, a3, a4
+; RV32-NEXT: or s2, a5, a2
+; RV32-NEXT: sltu s0, s1, s0
+; RV32-NEXT: slli s1, a5, 31
+; RV32-NEXT: addi s4, s4, -1
+; RV32-NEXT: beqz s1, .LBB1_13
+; RV32-NEXT: # %bb.12:
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: or t5, s1, t5
+; RV32-NEXT: srli s1, t5, 2
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 4
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 8
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: srli s1, t5, 16
+; RV32-NEXT: or t5, t5, s1
+; RV32-NEXT: not t5, t5
+; RV32-NEXT: srli s1, t5, 1
+; RV32-NEXT: and t0, s1, t0
+; RV32-NEXT: sub t0, t5, t0
+; RV32-NEXT: and t5, t0, a7
+; RV32-NEXT: srli t0, t0, 2
+; RV32-NEXT: and a7, t0, a7
+; RV32-NEXT: add a7, t5, a7
+; RV32-NEXT: srli t0, a7, 4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: and a6, a7, a6
+; RV32-NEXT: slli a7, a6, 8
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: slli a7, a6, 16
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: srli t5, a6, 24
+; RV32-NEXT: .LBB1_13: # %_udiv-special-cases
+; RV32-NEXT: or t0, t2, t1
+; RV32-NEXT: or a6, s2, t4
+; RV32-NEXT: and a7, s4, s0
+; RV32-NEXT: or t6, t6, a1
+; RV32-NEXT: addi s0, t5, 64
+; RV32-NEXT: bnez t6, .LBB1_15
+; RV32-NEXT: # %bb.14: # %_udiv-special-cases
+; RV32-NEXT: mv s3, s0
+; RV32-NEXT: .LBB1_15: # %_udiv-special-cases
+; RV32-NEXT: seqz a1, t0
+; RV32-NEXT: sltu t0, s0, t5
+; RV32-NEXT: snez t5, t6
+; RV32-NEXT: addi t5, t5, -1
+; RV32-NEXT: and t0, t5, t0
+; RV32-NEXT: sltu t5, t3, s3
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: mv t6, t5
+; RV32-NEXT: beq a7, t0, .LBB1_17
+; RV32-NEXT: # %bb.16: # %_udiv-special-cases
+; RV32-NEXT: sltu t6, a7, t0
+; RV32-NEXT: .LBB1_17: # %_udiv-special-cases
+; RV32-NEXT: or a1, a1, a6
+; RV32-NEXT: andi a6, t6, 1
+; RV32-NEXT: sub a7, a7, t0
+; RV32-NEXT: sub t5, a7, t5
+; RV32-NEXT: sub a7, t3, s3
+; RV32-NEXT: beqz a6, .LBB1_19
+; RV32-NEXT: # %bb.18: # %_udiv-special-cases
+; RV32-NEXT: mv t0, a6
+; RV32-NEXT: j .LBB1_20
+; RV32-NEXT: .LBB1_19:
+; RV32-NEXT: sltiu t0, a7, 65
+; RV32-NEXT: xori t0, t0, 1
+; RV32-NEXT: snez t3, t5
+; RV32-NEXT: or t0, t0, t3
+; RV32-NEXT: .LBB1_20: # %_udiv-special-cases
+; RV32-NEXT: or t6, a1, t0
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and t3, t4, a1
+; RV32-NEXT: and t0, a1, a2
+; RV32-NEXT: and a1, a1, a5
+; RV32-NEXT: bnez t6, .LBB1_30
+; RV32-NEXT: # %bb.21: # %_udiv-special-cases
+; RV32-NEXT: xori t6, a7, 64
+; RV32-NEXT: or t6, t6, a6
+; RV32-NEXT: or t6, t6, t5
+; RV32-NEXT: beqz t6, .LBB1_30
+; RV32-NEXT: # %bb.22: # %udiv-bb1
+; RV32-NEXT: addi a1, a7, 1
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw zero, 36(sp)
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw zero, 44(sp)
+; RV32-NEXT: sw a5, 48(sp)
+; RV32-NEXT: sw a2, 52(sp)
+; RV32-NEXT: sw t4, 56(sp)
+; RV32-NEXT: li t0, 64
+; RV32-NEXT: addi t3, sp, 48
+; RV32-NEXT: neg s1, a7
+; RV32-NEXT: seqz t6, a1
+; RV32-NEXT: sub a7, t0, a7
+; RV32-NEXT: add t5, t5, t6
+; RV32-NEXT: andi t0, a7, 31
+; RV32-NEXT: srli a7, a7, 3
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: xori s2, t0, 31
+; RV32-NEXT: andi a7, a7, 12
+; RV32-NEXT: seqz t0, t6
+; RV32-NEXT: sub s3, t3, a7
+; RV32-NEXT: add a6, a6, t0
+; RV32-NEXT: lw t3, 0(s3)
+; RV32-NEXT: lw s4, 4(s3)
+; RV32-NEXT: andi a7, a6, 1
+; RV32-NEXT: or t6, t6, a7
+; RV32-NEXT: srli a6, t3, 1
+; RV32-NEXT: sll t0, s4, s1
+; RV32-NEXT: srl a6, a6, s2
+; RV32-NEXT: or t0, t0, a6
+; RV32-NEXT: sll a6, t3, s1
+; RV32-NEXT: li t3, 0
+; RV32-NEXT: beqz t6, .LBB1_28
+; RV32-NEXT: # %bb.23: # %udiv-preheader
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: srli s4, s4, 1
+; RV32-NEXT: lw s3, 8(s3)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw zero, 20(sp)
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: sw zero, 28(sp)
+; RV32-NEXT: sw a5, 0(sp)
+; RV32-NEXT: sw a2, 4(sp)
+; RV32-NEXT: sw t4, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: srli a2, a1, 3
+; RV32-NEXT: srl a5, s4, s2
+; RV32-NEXT: mv t4, sp
+; RV32-NEXT: snez t2, t2
+; RV32-NEXT: andi a2, a2, 12
+; RV32-NEXT: add t1, t1, t2
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: lw t2, 0(a2)
+; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw a2, 8(a2)
+; RV32-NEXT: sll s1, s3, s1
+; RV32-NEXT: andi s2, a1, 31
+; RV32-NEXT: xori s2, s2, 31
+; RV32-NEXT: or s3, s1, a5
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: slli a5, t4, 1
+; RV32-NEXT: sll a2, a2, s2
+; RV32-NEXT: sll s2, a5, s2
+; RV32-NEXT: srl s1, t4, a1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: seqz a2, a3
+; RV32-NEXT: sub a2, a4, a2
+; RV32-NEXT: addi a5, t1, 1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: andi s3, s3, 1
+; RV32-NEXT: srl t1, t2, a1
+; RV32-NEXT: or s2, t1, s2
+; RV32-NEXT: addi t1, a3, -1
+; RV32-NEXT: j .LBB1_26
+; RV32-NEXT: .LBB1_24: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, a2, s4
+; RV32-NEXT: .LBB1_25: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: srli s1, s1, 31
+; RV32-NEXT: sub t4, a5, s1
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: slli t2, t2, 31
+; RV32-NEXT: srai s1, t2, 31
+; RV32-NEXT: and s3, s1, a4
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: srli s5, a6, 31
+; RV32-NEXT: sub s4, s4, s3
+; RV32-NEXT: slli s3, t0, 1
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli t0, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a6, t3, a6
+; RV32-NEXT: seqz t3, a1
+; RV32-NEXT: or s0, s0, t0
+; RV32-NEXT: or s5, a1, t5
+; RV32-NEXT: sub t5, t5, t3
+; RV32-NEXT: and s6, s1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi t3, s1, 1
+; RV32-NEXT: or t0, t6, s3
+; RV32-NEXT: sltu t6, s2, s6
+; RV32-NEXT: snez s5, s5
+; RV32-NEXT: andi s3, s0, 1
+; RV32-NEXT: sub s1, s4, t6
+; RV32-NEXT: add a7, a7, s5
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: andi a7, a7, 1
+; RV32-NEXT: or t6, a1, t5
+; RV32-NEXT: or s4, t6, a7
+; RV32-NEXT: sub s2, s2, s6
+; RV32-NEXT: li t6, 0
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: beqz s4, .LBB1_29
+; RV32-NEXT: .LBB1_26: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli t2, s2, 31
+; RV32-NEXT: slli t4, s1, 1
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or s4, t4, t2
+; RV32-NEXT: andi t2, s3, 1
+; RV32-NEXT: or s2, s2, t2
+; RV32-NEXT: bne a2, s4, .LBB1_24
+; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1
+; RV32-NEXT: sltu t2, t1, s2
+; RV32-NEXT: j .LBB1_25
+; RV32-NEXT: .LBB1_28:
+; RV32-NEXT: li t2, 0
+; RV32-NEXT: li t4, 0
+; RV32-NEXT: .LBB1_29: # %udiv-loop-exit
+; RV32-NEXT: srli a2, a6, 31
+; RV32-NEXT: slli a3, t0, 1
+; RV32-NEXT: srli a4, t0, 31
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: or a1, t3, a6
+; RV32-NEXT: or a2, t2, a2
+; RV32-NEXT: or a4, t4, a4
+; RV32-NEXT: or t0, a2, a3
+; RV32-NEXT: andi t3, a4, 1
+; RV32-NEXT: .LBB1_30: # %udiv-end
+; RV32-NEXT: andi a2, t3, 1
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: sw t0, 4(a0)
+; RV32-NEXT: sb a2, 8(a0)
+; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 96
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i65:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: andi a1, a1, 1
+; RV64-NEXT: andi a3, a3, 1
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %res = udiv i65 %x, %y
+ ret i65 %res
+}
define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
-; CHECK-LABEL: udiv_i128:
-; CHECK: call __udivti3
+; RV32-LABEL: udiv_i128:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -160
+; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: lw s8, 0(a2)
+; RV32-NEXT: lw s9, 4(a2)
+; RV32-NEXT: lw s11, 8(a2)
+; RV32-NEXT: lw ra, 12(a2)
+; RV32-NEXT: lui t4, 349525
+; RV32-NEXT: addi t4, t4, 1365
+; RV32-NEXT: lui t3, 209715
+; RV32-NEXT: addi t3, t3, 819
+; RV32-NEXT: lui t2, 61681
+; RV32-NEXT: addi t2, t2, -241
+; RV32-NEXT: bnez s9, .LBB2_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s8, 1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t6, a0, 32
+; RV32-NEXT: j .LBB2_3
+; RV32-NEXT: .LBB2_2:
+; RV32-NEXT: srli a0, s9, 1
+; RV32-NEXT: or a0, s9, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t6, a0, 24
+; RV32-NEXT: .LBB2_3: # %_udiv-special-cases
+; RV32-NEXT: lw a6, 4(a1)
+; RV32-NEXT: or s0, s11, ra
+; RV32-NEXT: bnez ra, .LBB2_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s11, 1
+; RV32-NEXT: or a0, s11, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi t5, a0, 32
+; RV32-NEXT: j .LBB2_6
+; RV32-NEXT: .LBB2_5:
+; RV32-NEXT: srli a0, ra, 1
+; RV32-NEXT: or a0, ra, a0
+; RV32-NEXT: srli a3, a0, 2
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a3, a0, 16
+; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a0, a0, a3
+; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: srli a3, a0, 4
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: and a0, a0, t2
+; RV32-NEXT: slli a3, a0, 8
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: slli a3, a0, 16
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srli t5, a0, 24
+; RV32-NEXT: .LBB2_6: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 12(a1)
+; RV32-NEXT: addi a0, t6, 64
+; RV32-NEXT: bnez s0, .LBB2_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: mv t5, a0
+; RV32-NEXT: .LBB2_8: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 0(a1)
+; RV32-NEXT: lw t0, 8(a1)
+; RV32-NEXT: snez s3, s0
+; RV32-NEXT: bnez a6, .LBB2_10
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: srli a1, t1, 1
+; RV32-NEXT: or a1, t1, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: addi a3, a1, 32
+; RV32-NEXT: j .LBB2_11
+; RV32-NEXT: .LBB2_10:
+; RV32-NEXT: srli a1, a6, 1
+; RV32-NEXT: or a1, a6, a1
+; RV32-NEXT: srli a3, a1, 2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 8
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: srli a3, a1, 16
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: not a1, a1
+; RV32-NEXT: srli a3, a1, 1
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: and a3, a1, t3
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and a1, a1, t3
+; RV32-NEXT: add a1, a3, a1
+; RV32-NEXT: srli a3, a1, 4
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: slli a3, a1, 8
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: slli a3, a1, 16
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: srli a3, a1, 24
+; RV32-NEXT: .LBB2_11: # %_udiv-special-cases
+; RV32-NEXT: or a1, s9, ra
+; RV32-NEXT: or s0, s8, s11
+; RV32-NEXT: or s1, a6, a7
+; RV32-NEXT: or s2, t1, t0
+; RV32-NEXT: sltu t6, a0, t6
+; RV32-NEXT: addi s3, s3, -1
+; RV32-NEXT: addi a0, a3, 64
+; RV32-NEXT: or s4, t0, a7
+; RV32-NEXT: sltu s5, a0, a3
+; RV32-NEXT: snez s6, s4
+; RV32-NEXT: addi s6, s6, -1
+; RV32-NEXT: bnez a7, .LBB2_13
+; RV32-NEXT: # %bb.12: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t0, 1
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a3, a3, 32
+; RV32-NEXT: j .LBB2_14
+; RV32-NEXT: .LBB2_13:
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a3, a7, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t4
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t3
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t2
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: .LBB2_14: # %_udiv-special-cases
+; RV32-NEXT: or s0, s0, a1
+; RV32-NEXT: or a5, s2, s1
+; RV32-NEXT: and a1, s3, t6
+; RV32-NEXT: and a4, s6, s5
+; RV32-NEXT: bnez s4, .LBB2_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv a3, a0
+; RV32-NEXT: .LBB2_16: # %_udiv-special-cases
+; RV32-NEXT: seqz a0, s0
+; RV32-NEXT: seqz a5, a5
+; RV32-NEXT: sltu t2, t5, a3
+; RV32-NEXT: sub t4, a1, a4
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: beq a1, a4, .LBB2_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: sltu t3, a1, a4
+; RV32-NEXT: .LBB2_18: # %_udiv-special-cases
+; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: or a0, a0, a5
+; RV32-NEXT: neg t4, t3
+; RV32-NEXT: seqz t6, t3
+; RV32-NEXT: addi t6, t6, -1
+; RV32-NEXT: or a1, t4, t6
+; RV32-NEXT: sub t3, t5, a3
+; RV32-NEXT: beqz a1, .LBB2_20
+; RV32-NEXT: # %bb.19: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: j .LBB2_21
+; RV32-NEXT: .LBB2_20:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a3, t3, 128
+; RV32-NEXT: xori a3, a3, 1
+; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: .LBB2_21: # %_udiv-special-cases
+; RV32-NEXT: or a5, a0, a1
+; RV32-NEXT: addi a3, a5, -1
+; RV32-NEXT: and a0, a3, a7
+; RV32-NEXT: and a1, a3, t0
+; RV32-NEXT: and a4, a3, a6
+; RV32-NEXT: and a3, a3, t1
+; RV32-NEXT: bnez a5, .LBB2_26
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: xori a5, t3, 127
+; RV32-NEXT: or a5, a5, t4
+; RV32-NEXT: or t5, t2, t6
+; RV32-NEXT: or a5, a5, t5
+; RV32-NEXT: beqz a5, .LBB2_26
+; RV32-NEXT: # %bb.23: # %udiv-bb1
+; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, t3, 1
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw t1, 88(sp)
+; RV32-NEXT: sw a6, 92(sp)
+; RV32-NEXT: sw t0, 96(sp)
+; RV32-NEXT: sw a7, 100(sp)
+; RV32-NEXT: li a0, 127
+; RV32-NEXT: addi a2, sp, 88
+; RV32-NEXT: seqz a3, a1
+; RV32-NEXT: sub a0, a0, t3
+; RV32-NEXT: add t2, t2, a3
+; RV32-NEXT: andi a3, a0, 31
+; RV32-NEXT: srli a0, a0, 3
+; RV32-NEXT: or a4, a1, t2
+; RV32-NEXT: xori a3, a3, 31
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: seqz t5, a4
+; RV32-NEXT: sub a2, a2, a0
+; RV32-NEXT: add t5, t4, t5
+; RV32-NEXT: lw a0, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw a5, 8(a2)
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: sltu t4, t5, t4
+; RV32-NEXT: or s0, a1, t5
+; RV32-NEXT: add t4, t6, t4
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: or s0, s0, t6
+; RV32-NEXT: srli t6, a5, 1
+; RV32-NEXT: srli s1, a4, 1
+; RV32-NEXT: srli s2, a0, 1
+; RV32-NEXT: srl t6, t6, a3
+; RV32-NEXT: srl s1, s1, a3
+; RV32-NEXT: srl a3, s2, a3
+; RV32-NEXT: not t3, t3
+; RV32-NEXT: sll a2, a2, t3
+; RV32-NEXT: or s2, a2, t6
+; RV32-NEXT: sll a2, a5, t3
+; RV32-NEXT: sll a4, a4, t3
+; RV32-NEXT: or s1, a2, s1
+; RV32-NEXT: or t6, a4, a3
+; RV32-NEXT: sll t3, a0, t3
+; RV32-NEXT: bnez s0, .LBB2_27
+; RV32-NEXT: # %bb.24:
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB2_25: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a0, s2, a0
+; RV32-NEXT: srli a1, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or a1, s1, a1
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli a3, t3, 1
+; RV32-NEXT: or a3, s0, a3
+; RV32-NEXT: or a2, s6, a2
+; RV32-NEXT: or a4, a2, t6
+; RV32-NEXT: or a1, s7, a1
+; RV32-NEXT: or a0, s8, a0
+; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB2_26: # %udiv-end
+; RV32-NEXT: sw a3, 0(s7)
+; RV32-NEXT: sw a4, 4(s7)
+; RV32-NEXT: sw a1, 8(s7)
+; RV32-NEXT: sw a0, 12(s7)
+; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 160
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB2_27: # %udiv-preheader
+; RV32-NEXT: li s0, 0
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw zero, 60(sp)
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw zero, 68(sp)
+; RV32-NEXT: sw t1, 40(sp)
+; RV32-NEXT: sw a6, 44(sp)
+; RV32-NEXT: sw t0, 48(sp)
+; RV32-NEXT: sw a7, 52(sp)
+; RV32-NEXT: srli a0, a1, 3
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: andi a0, a0, 12
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: andi a5, a1, 31
+; RV32-NEXT: xori a5, a5, 31
+; RV32-NEXT: slli a6, a4, 1
+; RV32-NEXT: slli a7, a3, 1
+; RV32-NEXT: slli t0, a2, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a7, a7, a5
+; RV32-NEXT: sll a5, t0, a5
+; RV32-NEXT: seqz t0, s8
+; RV32-NEXT: srl a3, a3, a1
+; RV32-NEXT: or s10, a3, a6
+; RV32-NEXT: or a3, s8, s9
+; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a6, s9, t0
+; RV32-NEXT: seqz a3, a3
+; RV32-NEXT: srl a2, a2, a1
+; RV32-NEXT: or s9, a2, a7
+; RV32-NEXT: sub a7, s11, a3
+; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a2, s11, a3
+; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a2, ra, a2
+; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: srl ra, a4, a1
+; RV32-NEXT: or t1, a0, a5
+; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s8, s8, -1
+; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: j .LBB2_29
+; RV32-NEXT: .LBB2_28: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: li s6, 0
+; RV32-NEXT: sub a0, a0, a5
+; RV32-NEXT: srli a5, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a5, s2, a5
+; RV32-NEXT: srli s2, t6, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, s2
+; RV32-NEXT: srli s2, t3, 31
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: or t6, t6, s2
+; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s2, s10, a2
+; RV32-NEXT: or t3, s0, t3
+; RV32-NEXT: sub a2, a3, s2
+; RV32-NEXT: sltu a3, a3, s2
+; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, s10, t0
+; RV32-NEXT: sub t0, s9, s0
+; RV32-NEXT: or s2, a1, t2
+; RV32-NEXT: sub s9, a0, a4
+; RV32-NEXT: seqz a0, a1
+; RV32-NEXT: sub t2, t2, a0
+; RV32-NEXT: or t6, s5, t6
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi s0, s10, 1
+; RV32-NEXT: seqz a0, s2
+; RV32-NEXT: or s1, s3, s1
+; RV32-NEXT: or s2, s4, a5
+; RV32-NEXT: sub s10, a2, ra
+; RV32-NEXT: sltu a2, a2, ra
+; RV32-NEXT: sub a3, t0, a3
+; RV32-NEXT: sltu a4, t5, a0
+; RV32-NEXT: sub t5, t5, a0
+; RV32-NEXT: sub ra, a3, a2
+; RV32-NEXT: sub t4, t4, a4
+; RV32-NEXT: or a0, t2, t4
+; RV32-NEXT: or a2, a1, t5
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sub t1, s11, t1
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: li s3, 0
+; RV32-NEXT: li s4, 0
+; RV32-NEXT: beqz a0, .LBB2_25
+; RV32-NEXT: .LBB2_29: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a0, t1, 31
+; RV32-NEXT: slli a3, s9, 1
+; RV32-NEXT: slli t1, t1, 1
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: srli a3, s2, 31
+; RV32-NEXT: or s11, t1, a3
+; RV32-NEXT: beq a6, a0, .LBB2_31
+; RV32-NEXT: # %bb.30: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu a4, a6, a0
+; RV32-NEXT: j .LBB2_32
+; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a4, a2, s11
+; RV32-NEXT: .LBB2_32: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, s10, 31
+; RV32-NEXT: slli ra, ra, 1
+; RV32-NEXT: srli a5, s9, 31
+; RV32-NEXT: slli s10, s10, 1
+; RV32-NEXT: or s9, ra, a3
+; RV32-NEXT: or a3, s10, a5
+; RV32-NEXT: sub a5, a7, a3
+; RV32-NEXT: sltu t1, a7, a3
+; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, t0, s9
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: sub a5, s6, t1
+; RV32-NEXT: sub a5, a5, a4
+; RV32-NEXT: srai s10, a5, 31
+; RV32-NEXT: and t1, s10, a2
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a5, s10, a2
+; RV32-NEXT: sltu a4, s11, t1
+; RV32-NEXT: mv ra, a4
+; RV32-NEXT: beq a0, a5, .LBB2_28
+; RV32-NEXT: # %bb.33: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
+; RV32-NEXT: sltu ra, a0, a5
+; RV32-NEXT: j .LBB2_28
+;
+; RV64-LABEL: udiv_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call __udivti3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
%res = udiv i128 %x, %y
ret i128 %res
}
define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
-; CHECK-LABEL: udiv_i129:
-; CHECK-NOT: call{{.*}}div
+; RV32-LABEL: udiv_i129:
+; RV32: # %bb.0: # %_udiv-special-cases
+; RV32-NEXT: addi sp, sp, -240
+; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: lw t2, 16(a2)
+; RV32-NEXT: lw a4, 0(a2)
+; RV32-NEXT: lw a5, 4(a2)
+; RV32-NEXT: lw a6, 8(a2)
+; RV32-NEXT: lw a0, 12(a2)
+; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: lui a0, 349525
+; RV32-NEXT: lui a2, 209715
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi t5, a0, 1365
+; RV32-NEXT: addi t4, a2, 819
+; RV32-NEXT: addi t3, a3, -241
+; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, a5, 1
+; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: slli a3, a5, 31
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: srli a2, a4, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: bnez a0, .LBB3_2
+; RV32-NEXT: # %bb.1: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a6, a3, 32
+; RV32-NEXT: j .LBB3_3
+; RV32-NEXT: .LBB3_2:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a6, a3, 24
+; RV32-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: slli a5, t2, 31
+; RV32-NEXT: slli a7, a7, 31
+; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli t0, a4, 1
+; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: slli a4, a4, 31
+; RV32-NEXT: li s2, 64
+; RV32-NEXT: bnez a4, .LBB3_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li t6, 64
+; RV32-NEXT: j .LBB3_6
+; RV32-NEXT: .LBB3_5:
+; RV32-NEXT: srli t1, a4, 1
+; RV32-NEXT: or t1, a4, t1
+; RV32-NEXT: srli t6, t1, 2
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 8
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 16
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: not t1, t1
+; RV32-NEXT: srli t6, t1, 1
+; RV32-NEXT: and t6, t6, t5
+; RV32-NEXT: sub t1, t1, t6
+; RV32-NEXT: and t6, t1, t4
+; RV32-NEXT: srli t1, t1, 2
+; RV32-NEXT: and t1, t1, t4
+; RV32-NEXT: add t1, t6, t1
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: and t1, t1, t3
+; RV32-NEXT: slli t6, t1, 8
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: slli t6, t1, 16
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: srli t6, t1, 24
+; RV32-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV32-NEXT: or t1, a5, a3
+; RV32-NEXT: or a7, t0, a7
+; RV32-NEXT: bnez a4, .LBB3_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: li t6, 128
+; RV32-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV32-NEXT: or a5, a7, t1
+; RV32-NEXT: addi a4, a6, 64
+; RV32-NEXT: addi a3, t6, 128
+; RV32-NEXT: or a0, a0, t1
+; RV32-NEXT: or a2, a2, a7
+; RV32-NEXT: or s3, a2, a0
+; RV32-NEXT: sltu s0, a3, t6
+; RV32-NEXT: bnez s3, .LBB3_11
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: mv t6, s0
+; RV32-NEXT: beqz t1, .LBB3_12
+; RV32-NEXT: .LBB3_10:
+; RV32-NEXT: srli a0, t1, 1
+; RV32-NEXT: or a0, t1, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli s1, a0, 24
+; RV32-NEXT: beqz a5, .LBB3_13
+; RV32-NEXT: j .LBB3_14
+; RV32-NEXT: .LBB3_11:
+; RV32-NEXT: snez a0, a5
+; RV32-NEXT: sltu a2, a4, a6
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and t6, a0, a2
+; RV32-NEXT: bnez t1, .LBB3_10
+; RV32-NEXT: .LBB3_12: # %_udiv-special-cases
+; RV32-NEXT: srli a0, a7, 1
+; RV32-NEXT: or a0, a7, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi s1, a0, 32
+; RV32-NEXT: bnez a5, .LBB3_14
+; RV32-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a4
+; RV32-NEXT: .LBB3_14: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: bnez s3, .LBB3_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a3
+; RV32-NEXT: .LBB3_16: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 12(a1)
+; RV32-NEXT: lw a1, 16(a1)
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, t0, 1
+; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: slli a2, t0, 31
+; RV32-NEXT: srli a3, a7, 1
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: bnez a0, .LBB3_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a2, 1
+; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi s5, a3, 32
+; RV32-NEXT: j .LBB3_19
+; RV32-NEXT: .LBB3_18:
+; RV32-NEXT: srli a3, a0, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 2
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 8
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: srli a4, a3, 16
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: not a3, a3
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: and a4, a4, t5
+; RV32-NEXT: sub a3, a3, a4
+; RV32-NEXT: and a4, a3, t4
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and a3, a3, t4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: srli a4, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: and a3, a3, t3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: srli s5, a3, 24
+; RV32-NEXT: .LBB3_19: # %_udiv-special-cases
+; RV32-NEXT: srli a3, t1, 1
+; RV32-NEXT: slli a4, a1, 31
+; RV32-NEXT: slli a5, t1, 31
+; RV32-NEXT: slli s4, a7, 31
+; RV32-NEXT: srli s6, a6, 1
+; RV32-NEXT: beqz s4, .LBB3_21
+; RV32-NEXT: # %bb.20:
+; RV32-NEXT: srli s2, s4, 1
+; RV32-NEXT: or s2, s4, s2
+; RV32-NEXT: srli s7, s2, 2
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 8
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 16
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: not s2, s2
+; RV32-NEXT: srli s7, s2, 1
+; RV32-NEXT: and s7, s7, t5
+; RV32-NEXT: sub s2, s2, s7
+; RV32-NEXT: and s7, s2, t4
+; RV32-NEXT: srli s2, s2, 2
+; RV32-NEXT: and s2, s2, t4
+; RV32-NEXT: add s2, s7, s2
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: and s2, s2, t3
+; RV32-NEXT: slli s7, s2, 8
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: slli s7, s2, 16
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: srli s2, s2, 24
+; RV32-NEXT: .LBB3_21: # %_udiv-special-cases
+; RV32-NEXT: or s7, a4, a3
+; RV32-NEXT: or s6, s6, a5
+; RV32-NEXT: bnez s4, .LBB3_23
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: li s2, 128
+; RV32-NEXT: .LBB3_23: # %_udiv-special-cases
+; RV32-NEXT: or s4, s6, s7
+; RV32-NEXT: addi a5, s5, 64
+; RV32-NEXT: addi a3, s2, 128
+; RV32-NEXT: or a0, a0, s7
+; RV32-NEXT: or a4, a2, s6
+; RV32-NEXT: or a4, a4, a0
+; RV32-NEXT: sltu a0, a3, s2
+; RV32-NEXT: bnez a4, .LBB3_26
+; RV32-NEXT: # %bb.24: # %_udiv-special-cases
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: beqz s7, .LBB3_27
+; RV32-NEXT: .LBB3_25:
+; RV32-NEXT: srli s3, s7, 1
+; RV32-NEXT: or s3, s7, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: j .LBB3_28
+; RV32-NEXT: .LBB3_26:
+; RV32-NEXT: snez a2, s4
+; RV32-NEXT: sltu s2, a5, s5
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a2, a2, s2
+; RV32-NEXT: snez s2, s3
+; RV32-NEXT: bnez s7, .LBB3_25
+; RV32-NEXT: .LBB3_27: # %_udiv-special-cases
+; RV32-NEXT: srli s3, s6, 1
+; RV32-NEXT: or s3, s6, s3
+; RV32-NEXT: srli s5, s3, 2
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 4
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 8
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: srli s5, s3, 16
+; RV32-NEXT: or s3, s3, s5
+; RV32-NEXT: not s3, s3
+; RV32-NEXT: srli s5, s3, 1
+; RV32-NEXT: and t5, s5, t5
+; RV32-NEXT: sub t5, s3, t5
+; RV32-NEXT: and s3, t5, t4
+; RV32-NEXT: srli t5, t5, 2
+; RV32-NEXT: and t4, t5, t4
+; RV32-NEXT: add t4, s3, t4
+; RV32-NEXT: srli t5, t4, 4
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: and t3, t4, t3
+; RV32-NEXT: slli t4, t3, 8
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: slli t4, t3, 16
+; RV32-NEXT: add t3, t3, t4
+; RV32-NEXT: srli t3, t3, 24
+; RV32-NEXT: addi t3, t3, 32
+; RV32-NEXT: .LBB3_28: # %_udiv-special-cases
+; RV32-NEXT: xori t4, s0, 1
+; RV32-NEXT: addi s2, s2, -1
+; RV32-NEXT: bnez s4, .LBB3_30
+; RV32-NEXT: # %bb.29: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a5
+; RV32-NEXT: .LBB3_30: # %_udiv-special-cases
+; RV32-NEXT: andi s11, a1, 1
+; RV32-NEXT: andi s8, t2, 1
+; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s9, a1, a5
+; RV32-NEXT: or t2, a7, a6
+; RV32-NEXT: neg a1, t4
+; RV32-NEXT: and s0, s2, s0
+; RV32-NEXT: bnez a4, .LBB3_32
+; RV32-NEXT: # %bb.31: # %_udiv-special-cases
+; RV32-NEXT: mv t3, a3
+; RV32-NEXT: .LBB3_32: # %_udiv-special-cases
+; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s10, a3, a5
+; RV32-NEXT: or a5, s9, s8
+; RV32-NEXT: or t4, t0, t1
+; RV32-NEXT: or t5, t2, s11
+; RV32-NEXT: and a1, s0, a1
+; RV32-NEXT: xori a3, a0, 1
+; RV32-NEXT: snez a4, a4
+; RV32-NEXT: neg a3, a3
+; RV32-NEXT: addi a4, a4, -1
+; RV32-NEXT: and a0, a4, a0
+; RV32-NEXT: sltu a4, s1, t3
+; RV32-NEXT: and t2, a0, a3
+; RV32-NEXT: mv a3, a4
+; RV32-NEXT: beq t6, a2, .LBB3_34
+; RV32-NEXT: # %bb.33: # %_udiv-special-cases
+; RV32-NEXT: sltu a3, t6, a2
+; RV32-NEXT: .LBB3_34: # %_udiv-special-cases
+; RV32-NEXT: or a0, a5, s10
+; RV32-NEXT: or t5, t5, t4
+; RV32-NEXT: sltu t4, a1, t2
+; RV32-NEXT: mv s0, a3
+; RV32-NEXT: beq a1, t2, .LBB3_36
+; RV32-NEXT: # %bb.35: # %_udiv-special-cases
+; RV32-NEXT: mv s0, t4
+; RV32-NEXT: .LBB3_36: # %_udiv-special-cases
+; RV32-NEXT: seqz a5, a0
+; RV32-NEXT: seqz t5, t5
+; RV32-NEXT: andi a0, s0, 1
+; RV32-NEXT: sub a2, t6, a2
+; RV32-NEXT: sub a1, a1, t2
+; RV32-NEXT: sub t2, a2, a4
+; RV32-NEXT: sltu a2, a1, a3
+; RV32-NEXT: add a2, t4, a2
+; RV32-NEXT: neg t4, a2
+; RV32-NEXT: sub a4, a1, a3
+; RV32-NEXT: or a1, a4, t4
+; RV32-NEXT: sub a3, s1, t3
+; RV32-NEXT: beqz a1, .LBB3_38
+; RV32-NEXT: # %bb.37: # %_udiv-special-cases
+; RV32-NEXT: snez a1, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: bnez a0, .LBB3_39
+; RV32-NEXT: j .LBB3_40
+; RV32-NEXT: .LBB3_38:
+; RV32-NEXT: snez a1, t2
+; RV32-NEXT: sltiu a2, a3, 129
+; RV32-NEXT: xori a2, a2, 1
+; RV32-NEXT: or a1, a2, a1
+; RV32-NEXT: or a2, a5, t5
+; RV32-NEXT: beqz a0, .LBB3_40
+; RV32-NEXT: .LBB3_39: # %_udiv-special-cases
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: .LBB3_40: # %_udiv-special-cases
+; RV32-NEXT: or t6, a2, a1
+; RV32-NEXT: addi a1, t6, -1
+; RV32-NEXT: and a2, s11, a1
+; RV32-NEXT: and a5, a1, t1
+; RV32-NEXT: and t3, a1, a6
+; RV32-NEXT: and t5, a1, t0
+; RV32-NEXT: and a1, a1, a7
+; RV32-NEXT: bnez t6, .LBB3_57
+; RV32-NEXT: # %bb.41: # %_udiv-special-cases
+; RV32-NEXT: or t6, t2, t4
+; RV32-NEXT: xori s0, a3, 128
+; RV32-NEXT: or s0, s0, a0
+; RV32-NEXT: or s0, s0, a4
+; RV32-NEXT: or t6, s0, t6
+; RV32-NEXT: beqz t6, .LBB3_57
+; RV32-NEXT: # %bb.42: # %udiv-bb1
+; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, a3, 1
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw zero, 140(sp)
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw zero, 148(sp)
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw zero, 124(sp)
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw zero, 132(sp)
+; RV32-NEXT: sw a7, 152(sp)
+; RV32-NEXT: sw t0, 156(sp)
+; RV32-NEXT: sw a6, 160(sp)
+; RV32-NEXT: sw t1, 164(sp)
+; RV32-NEXT: sw s11, 168(sp)
+; RV32-NEXT: li a5, 128
+; RV32-NEXT: addi t3, sp, 152
+; RV32-NEXT: neg a2, a3
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: sub a5, a5, a3
+; RV32-NEXT: add t2, t2, t5
+; RV32-NEXT: andi a3, a5, 31
+; RV32-NEXT: srli t5, a5, 3
+; RV32-NEXT: or t6, a1, t2
+; RV32-NEXT: xori a5, a3, 31
+; RV32-NEXT: andi a3, t5, 28
+; RV32-NEXT: seqz t6, t6
+; RV32-NEXT: sub ra, t3, a3
+; RV32-NEXT: add t6, a4, t6
+; RV32-NEXT: lw t3, 0(ra)
+; RV32-NEXT: lw s0, 4(ra)
+; RV32-NEXT: lw s1, 8(ra)
+; RV32-NEXT: lw a3, 12(ra)
+; RV32-NEXT: sltu a4, t6, a4
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: add t4, t4, a4
+; RV32-NEXT: or a4, t2, t4
+; RV32-NEXT: or a4, t5, a4
+; RV32-NEXT: srli t5, s1, 1
+; RV32-NEXT: seqz s2, a4
+; RV32-NEXT: add a0, a0, s2
+; RV32-NEXT: sll s2, a3, a2
+; RV32-NEXT: srl t5, t5, a5
+; RV32-NEXT: or t5, s2, t5
+; RV32-NEXT: srli s2, s0, 1
+; RV32-NEXT: sll s1, s1, a2
+; RV32-NEXT: srl s2, s2, a5
+; RV32-NEXT: or s2, s1, s2
+; RV32-NEXT: srli s1, t3, 1
+; RV32-NEXT: sll s0, s0, a2
+; RV32-NEXT: srl s1, s1, a5
+; RV32-NEXT: andi s3, a0, 1
+; RV32-NEXT: or s1, s0, s1
+; RV32-NEXT: or a0, a4, s3
+; RV32-NEXT: sll t3, t3, a2
+; RV32-NEXT: beqz a0, .LBB3_55
+; RV32-NEXT: # %bb.43: # %udiv-preheader
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: srli a3, a3, 1
+; RV32-NEXT: lw a0, 16(ra)
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw zero, 108(sp)
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw zero, 116(sp)
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw zero, 92(sp)
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw zero, 100(sp)
+; RV32-NEXT: sw s11, 72(sp)
+; RV32-NEXT: sw zero, 76(sp)
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw zero, 84(sp)
+; RV32-NEXT: sw a7, 56(sp)
+; RV32-NEXT: sw t0, 60(sp)
+; RV32-NEXT: sw a6, 64(sp)
+; RV32-NEXT: sw t1, 68(sp)
+; RV32-NEXT: srli a4, a1, 3
+; RV32-NEXT: addi a6, sp, 56
+; RV32-NEXT: andi a7, a1, 31
+; RV32-NEXT: or t0, s9, s10
+; RV32-NEXT: srl a3, a3, a5
+; RV32-NEXT: andi a4, a4, 28
+; RV32-NEXT: xori a5, a7, 31
+; RV32-NEXT: snez a7, t0
+; RV32-NEXT: add a4, a6, a4
+; RV32-NEXT: add a7, s8, a7
+; RV32-NEXT: lw a6, 16(a4)
+; RV32-NEXT: lw t0, 0(a4)
+; RV32-NEXT: lw t1, 4(a4)
+; RV32-NEXT: lw s0, 8(a4)
+; RV32-NEXT: lw a4, 12(a4)
+; RV32-NEXT: sll a0, a0, a2
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: slli a0, a4, 1
+; RV32-NEXT: slli a2, s0, 1
+; RV32-NEXT: slli s4, t1, 1
+; RV32-NEXT: sll a6, a6, a5
+; RV32-NEXT: sll a0, a0, a5
+; RV32-NEXT: sll s8, a2, a5
+; RV32-NEXT: sll s4, s4, a5
+; RV32-NEXT: srl a2, a4, a1
+; RV32-NEXT: or ra, a2, a6
+; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: seqz a4, a6
+; RV32-NEXT: srl a2, s0, a1
+; RV32-NEXT: or a2, a2, a0
+; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a0, a6, a5
+; RV32-NEXT: sub s5, a5, a4
+; RV32-NEXT: seqz a4, a0
+; RV32-NEXT: srl a0, t1, a1
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub t1, a5, a4
+; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a4, a5, a4
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s6, a5, a4
+; RV32-NEXT: andi a4, a7, 1
+; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: andi a5, a3, 1
+; RV32-NEXT: srl a3, t0, a1
+; RV32-NEXT: or a4, a3, s4
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: j .LBB3_45
+; RV32-NEXT: .LBB3_44: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s0, a5, s0
+; RV32-NEXT: xor s8, t1, a7
+; RV32-NEXT: xor s9, a2, s0
+; RV32-NEXT: or s8, s9, s8
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: sltu s4, a2, s0
+; RV32-NEXT: sub s0, a2, s0
+; RV32-NEXT: sub a7, t1, a7
+; RV32-NEXT: srli a2, s2, 31
+; RV32-NEXT: sub a0, a0, t0
+; RV32-NEXT: slli t0, t5, 1
+; RV32-NEXT: or t0, t0, a2
+; RV32-NEXT: srli a2, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or t1, s2, a2
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t3, a2, t3
+; RV32-NEXT: srli a2, t5, 31
+; RV32-NEXT: or s7, s7, a2
+; RV32-NEXT: sub a2, s0, ra
+; RV32-NEXT: sltu s0, s0, ra
+; RV32-NEXT: or t5, a1, t6
+; RV32-NEXT: sub a7, a7, s4
+; RV32-NEXT: or s2, t2, t4
+; RV32-NEXT: sub a0, a0, a6
+; RV32-NEXT: or a6, a1, t2
+; RV32-NEXT: or s4, t5, s2
+; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: seqz a6, a6
+; RV32-NEXT: sub t2, t2, t5
+; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s1, a5, s1
+; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s2, a5, t1
+; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t5, a5, t0
+; RV32-NEXT: andi a5, s7, 1
+; RV32-NEXT: sub ra, a7, s0
+; RV32-NEXT: snez a7, s4
+; RV32-NEXT: sltu t0, t6, a6
+; RV32-NEXT: sub t6, t6, a6
+; RV32-NEXT: add a7, s3, a7
+; RV32-NEXT: sub t4, t4, t0
+; RV32-NEXT: or a6, a1, t6
+; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: or t0, t2, t4
+; RV32-NEXT: andi s3, a7, 1
+; RV32-NEXT: or a6, a6, t0
+; RV32-NEXT: or a6, a6, s3
+; RV32-NEXT: sub a4, a4, a3
+; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s7, 0
+; RV32-NEXT: beqz a6, .LBB3_56
+; RV32-NEXT: .LBB3_45: # %udiv-do-while
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: srli a3, a2, 31
+; RV32-NEXT: slli a6, ra, 1
+; RV32-NEXT: or t1, a6, a3
+; RV32-NEXT: srli a3, a0, 31
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: beq s6, t1, .LBB3_47
+; RV32-NEXT: # %bb.46: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a3, s6, t1
+; RV32-NEXT: j .LBB3_48
+; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a3, a3, a2
+; RV32-NEXT: .LBB3_48: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a6, a4, 31
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: andi a5, a5, 1
+; RV32-NEXT: or a4, a4, a5
+; RV32-NEXT: beq s5, a0, .LBB3_50
+; RV32-NEXT: # %bb.49: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu a5, s5, a0
+; RV32-NEXT: j .LBB3_51
+; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a5, a5, a4
+; RV32-NEXT: .LBB3_51: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: xor a6, a6, a2
+; RV32-NEXT: xor a7, s6, t1
+; RV32-NEXT: or a6, a6, a7
+; RV32-NEXT: beqz a6, .LBB3_53
+; RV32-NEXT: # %bb.52: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: mv a5, a3
+; RV32-NEXT: .LBB3_53: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: srli a3, ra, 31
+; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub a3, a6, a3
+; RV32-NEXT: sub a3, a3, a5
+; RV32-NEXT: slli a3, a3, 31
+; RV32-NEXT: srai a5, a3, 31
+; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a7, a5, a3
+; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a3, a5, a3
+; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: and t0, a5, a6
+; RV32-NEXT: sltu a6, a4, a3
+; RV32-NEXT: mv ra, a6
+; RV32-NEXT: beq a0, t0, .LBB3_44
+; RV32-NEXT: # %bb.54: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
+; RV32-NEXT: sltu ra, a0, t0
+; RV32-NEXT: j .LBB3_44
+; RV32-NEXT: .LBB3_55:
+; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s11, 0
+; RV32-NEXT: li s9, 0
+; RV32-NEXT: li s10, 0
+; RV32-NEXT: li s8, 0
+; RV32-NEXT: .LBB3_56: # %udiv-loop-exit
+; RV32-NEXT: srli a0, s2, 31
+; RV32-NEXT: slli a1, t5, 1
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: srli a1, s1, 31
+; RV32-NEXT: slli s2, s2, 1
+; RV32-NEXT: or a2, s2, a1
+; RV32-NEXT: srli a3, t3, 31
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: srli a4, t5, 31
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a1, a1, t3
+; RV32-NEXT: or a3, s11, a3
+; RV32-NEXT: or a4, s8, a4
+; RV32-NEXT: or t5, a3, s1
+; RV32-NEXT: or t3, s9, a2
+; RV32-NEXT: or a5, s10, a0
+; RV32-NEXT: andi a2, a4, 1
+; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB3_57: # %udiv-end
+; RV32-NEXT: sw a1, 0(ra)
+; RV32-NEXT: sw t5, 4(ra)
+; RV32-NEXT: sw t3, 8(ra)
+; RV32-NEXT: sw a5, 12(ra)
+; RV32-NEXT: andi a2, a2, 1
+; RV32-NEXT: sb a2, 16(ra)
+; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 240
+; RV32-NEXT: ret
+;
+; RV64-LABEL: udiv_i129:
+; RV64: # %bb.0: # %_udiv-special-cases
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld t1, 16(a2)
+; RV64-NEXT: lui a2, 349525
+; RV64-NEXT: lui a5, 209715
+; RV64-NEXT: lui a6, 61681
+; RV64-NEXT: addi t0, a2, 1365
+; RV64-NEXT: addi a7, a5, 819
+; RV64-NEXT: addi a6, a6, -241
+; RV64-NEXT: slli a2, t0, 32
+; RV64-NEXT: slli a5, a7, 32
+; RV64-NEXT: slli t2, a6, 32
+; RV64-NEXT: add t0, t0, a2
+; RV64-NEXT: add a7, a7, a5
+; RV64-NEXT: add a6, a6, t2
+; RV64-NEXT: srli a2, a4, 1
+; RV64-NEXT: slli a5, t1, 63
+; RV64-NEXT: slli t2, a4, 63
+; RV64-NEXT: or t3, a5, a2
+; RV64-NEXT: srli a2, a3, 1
+; RV64-NEXT: or t4, a2, t2
+; RV64-NEXT: bnez t3, .LBB3_2
+; RV64-NEXT: # %bb.1: # %_udiv-special-cases
+; RV64-NEXT: srli a2, t4, 1
+; RV64-NEXT: or a2, t4, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli a2, a2, 56
+; RV64-NEXT: addi t2, a2, 64
+; RV64-NEXT: j .LBB3_3
+; RV64-NEXT: .LBB3_2:
+; RV64-NEXT: srli a2, t3, 1
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli t2, a2, 56
+; RV64-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV64-NEXT: addi sp, sp, -192
+; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT: slli a2, a3, 63
+; RV64-NEXT: li t5, 128
+; RV64-NEXT: bnez a2, .LBB3_5
+; RV64-NEXT: # %bb.4: # %_udiv-special-cases
+; RV64-NEXT: li s0, 128
+; RV64-NEXT: j .LBB3_6
+; RV64-NEXT: .LBB3_5:
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 2
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 8
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 16
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: srli a5, a2, 32
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a2, 1
+; RV64-NEXT: and a5, a5, t0
+; RV64-NEXT: sub a2, a2, a5
+; RV64-NEXT: and a5, a2, a7
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a2, a2, a7
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: srli a5, a2, 4
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: slli a5, a2, 8
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: srli s0, a2, 56
+; RV64-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV64-NEXT: ld a5, 0(a1)
+; RV64-NEXT: ld a2, 8(a1)
+; RV64-NEXT: ld s2, 16(a1)
+; RV64-NEXT: or a1, t4, t3
+; RV64-NEXT: addi s1, s0, 128
+; RV64-NEXT: bnez a1, .LBB3_8
+; RV64-NEXT: # %bb.7: # %_udiv-special-cases
+; RV64-NEXT: mv t2, s1
+; RV64-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV64-NEXT: snez s3, a1
+; RV64-NEXT: srli a1, a2, 1
+; RV64-NEXT: slli t3, s2, 63
+; RV64-NEXT: slli t4, a2, 63
+; RV64-NEXT: or a1, t3, a1
+; RV64-NEXT: srli t3, a5, 1
+; RV64-NEXT: or t6, t3, t4
+; RV64-NEXT: bnez a1, .LBB3_10
+; RV64-NEXT: # %bb.9: # %_udiv-special-cases
+; RV64-NEXT: srli t3, t6, 1
+; RV64-NEXT: or t3, t6, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli t3, t3, 56
+; RV64-NEXT: addi s4, t3, 64
+; RV64-NEXT: j .LBB3_11
+; RV64-NEXT: .LBB3_10:
+; RV64-NEXT: srli t3, a1, 1
+; RV64-NEXT: or t3, a1, t3
+; RV64-NEXT: srli t4, t3, 2
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 8
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 16
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: srli t4, t3, 32
+; RV64-NEXT: or t3, t3, t4
+; RV64-NEXT: not t3, t3
+; RV64-NEXT: srli t4, t3, 1
+; RV64-NEXT: and t4, t4, t0
+; RV64-NEXT: sub t3, t3, t4
+; RV64-NEXT: and t4, t3, a7
+; RV64-NEXT: srli t3, t3, 2
+; RV64-NEXT: and t3, t3, a7
+; RV64-NEXT: add t3, t4, t3
+; RV64-NEXT: srli t4, t3, 4
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: and t3, t3, a6
+; RV64-NEXT: slli t4, t3, 8
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 16
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: slli t4, t3, 32
+; RV64-NEXT: add t3, t3, t4
+; RV64-NEXT: srli s4, t3, 56
+; RV64-NEXT: .LBB3_11: # %_udiv-special-cases
+; RV64-NEXT: andi t4, s2, 1
+; RV64-NEXT: andi t1, t1, 1
+; RV64-NEXT: or t3, a3, a4
+; RV64-NEXT: or s2, a5, a2
+; RV64-NEXT: sltu s0, s1, s0
+; RV64-NEXT: slli s1, a5, 63
+; RV64-NEXT: addi s3, s3, -1
+; RV64-NEXT: beqz s1, .LBB3_13
+; RV64-NEXT: # %bb.12:
+; RV64-NEXT: srli t5, s1, 1
+; RV64-NEXT: or t5, s1, t5
+; RV64-NEXT: srli s1, t5, 2
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 4
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 8
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 16
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: srli s1, t5, 32
+; RV64-NEXT: or t5, t5, s1
+; RV64-NEXT: not t5, t5
+; RV64-NEXT: srli s1, t5, 1
+; RV64-NEXT: and t0, s1, t0
+; RV64-NEXT: sub t0, t5, t0
+; RV64-NEXT: and t5, t0, a7
+; RV64-NEXT: srli t0, t0, 2
+; RV64-NEXT: and a7, t0, a7
+; RV64-NEXT: add a7, t5, a7
+; RV64-NEXT: srli t0, a7, 4
+; RV64-NEXT: add a7, a7, t0
+; RV64-NEXT: and a6, a7, a6
+; RV64-NEXT: slli a7, a6, 8
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 16
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: slli a7, a6, 32
+; RV64-NEXT: add a6, a6, a7
+; RV64-NEXT: srli t5, a6, 56
+; RV64-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV64-NEXT: or t0, t3, t1
+; RV64-NEXT: or a6, s2, t4
+; RV64-NEXT: and a7, s3, s0
+; RV64-NEXT: or t6, t6, a1
+; RV64-NEXT: addi s0, t5, 128
+; RV64-NEXT: bnez t6, .LBB3_15
+; RV64-NEXT: # %bb.14: # %_udiv-special-cases
+; RV64-NEXT: mv s4, s0
+; RV64-NEXT: .LBB3_15: # %_udiv-special-cases
+; RV64-NEXT: seqz a1, t0
+; RV64-NEXT: sltu t0, s0, t5
+; RV64-NEXT: snez t5, t6
+; RV64-NEXT: addi t5, t5, -1
+; RV64-NEXT: and t0, t5, t0
+; RV64-NEXT: sltu t5, t2, s4
+; RV64-NEXT: seqz a6, a6
+; RV64-NEXT: mv t6, t5
+; RV64-NEXT: beq a7, t0, .LBB3_17
+; RV64-NEXT: # %bb.16: # %_udiv-special-cases
+; RV64-NEXT: sltu t6, a7, t0
+; RV64-NEXT: .LBB3_17: # %_udiv-special-cases
+; RV64-NEXT: or a1, a1, a6
+; RV64-NEXT: andi a6, t6, 1
+; RV64-NEXT: sub a7, a7, t0
+; RV64-NEXT: sub t5, a7, t5
+; RV64-NEXT: sub a7, t2, s4
+; RV64-NEXT: beqz a6, .LBB3_19
+; RV64-NEXT: # %bb.18: # %_udiv-special-cases
+; RV64-NEXT: mv t0, a6
+; RV64-NEXT: j .LBB3_20
+; RV64-NEXT: .LBB3_19:
+; RV64-NEXT: sltiu t0, a7, 129
+; RV64-NEXT: xori t0, t0, 1
+; RV64-NEXT: snez t2, t5
+; RV64-NEXT: or t0, t0, t2
+; RV64-NEXT: .LBB3_20: # %_udiv-special-cases
+; RV64-NEXT: or t6, a1, t0
+; RV64-NEXT: addi a1, t6, -1
+; RV64-NEXT: and t2, t4, a1
+; RV64-NEXT: and t0, a1, a2
+; RV64-NEXT: and a1, a1, a5
+; RV64-NEXT: bnez t6, .LBB3_30
+; RV64-NEXT: # %bb.21: # %_udiv-special-cases
+; RV64-NEXT: xori t6, a7, 128
+; RV64-NEXT: or t6, t6, a6
+; RV64-NEXT: or t6, t6, t5
+; RV64-NEXT: beqz t6, .LBB3_30
+; RV64-NEXT: # %bb.22: # %udiv-bb1
+; RV64-NEXT: addi a1, a7, 1
+; RV64-NEXT: sd zero, 64(sp)
+; RV64-NEXT: sd zero, 72(sp)
+; RV64-NEXT: sd zero, 80(sp)
+; RV64-NEXT: sd zero, 88(sp)
+; RV64-NEXT: sd a5, 96(sp)
+; RV64-NEXT: sd a2, 104(sp)
+; RV64-NEXT: sd t4, 112(sp)
+; RV64-NEXT: li t0, 128
+; RV64-NEXT: addi t2, sp, 96
+; RV64-NEXT: neg s1, a7
+; RV64-NEXT: seqz t6, a1
+; RV64-NEXT: sub a7, t0, a7
+; RV64-NEXT: add t5, t5, t6
+; RV64-NEXT: andi t0, a7, 63
+; RV64-NEXT: srli a7, a7, 3
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: xori s2, t0, 63
+; RV64-NEXT: andi a7, a7, 24
+; RV64-NEXT: seqz t0, t6
+; RV64-NEXT: sub s3, t2, a7
+; RV64-NEXT: add a6, a6, t0
+; RV64-NEXT: ld t2, 0(s3)
+; RV64-NEXT: ld s4, 8(s3)
+; RV64-NEXT: andi a7, a6, 1
+; RV64-NEXT: or t6, t6, a7
+; RV64-NEXT: srli a6, t2, 1
+; RV64-NEXT: sll t0, s4, s1
+; RV64-NEXT: srl a6, a6, s2
+; RV64-NEXT: or t0, t0, a6
+; RV64-NEXT: sll a6, t2, s1
+; RV64-NEXT: li t2, 0
+; RV64-NEXT: beqz t6, .LBB3_28
+; RV64-NEXT: # %bb.23: # %udiv-preheader
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: srli s4, s4, 1
+; RV64-NEXT: ld s3, 16(s3)
+; RV64-NEXT: sd zero, 32(sp)
+; RV64-NEXT: sd zero, 40(sp)
+; RV64-NEXT: sd zero, 48(sp)
+; RV64-NEXT: sd zero, 56(sp)
+; RV64-NEXT: sd a5, 0(sp)
+; RV64-NEXT: sd a2, 8(sp)
+; RV64-NEXT: sd t4, 16(sp)
+; RV64-NEXT: sd zero, 24(sp)
+; RV64-NEXT: srli a2, a1, 3
+; RV64-NEXT: srl a5, s4, s2
+; RV64-NEXT: mv t4, sp
+; RV64-NEXT: snez t3, t3
+; RV64-NEXT: andi a2, a2, 24
+; RV64-NEXT: add t1, t1, t3
+; RV64-NEXT: add a2, t4, a2
+; RV64-NEXT: ld t3, 0(a2)
+; RV64-NEXT: ld t4, 8(a2)
+; RV64-NEXT: ld a2, 16(a2)
+; RV64-NEXT: sll s1, s3, s1
+; RV64-NEXT: andi s2, a1, 63
+; RV64-NEXT: xori s2, s2, 63
+; RV64-NEXT: or s3, s1, a5
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: slli a5, t4, 1
+; RV64-NEXT: sll a2, a2, s2
+; RV64-NEXT: sll s2, a5, s2
+; RV64-NEXT: srl s1, t4, a1
+; RV64-NEXT: or s1, s1, a2
+; RV64-NEXT: seqz a2, a3
+; RV64-NEXT: sub a2, a4, a2
+; RV64-NEXT: addi a5, t1, 1
+; RV64-NEXT: andi a5, a5, 1
+; RV64-NEXT: andi s3, s3, 1
+; RV64-NEXT: srl t1, t3, a1
+; RV64-NEXT: or s2, t1, s2
+; RV64-NEXT: addi t1, a3, -1
+; RV64-NEXT: j .LBB3_26
+; RV64-NEXT: .LBB3_24: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, a2, s4
+; RV64-NEXT: .LBB3_25: # %udiv-do-while
+; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: srli s1, s1, 63
+; RV64-NEXT: sub t4, a5, s1
+; RV64-NEXT: sub t3, t4, t3
+; RV64-NEXT: slli t3, t3, 63
+; RV64-NEXT: srai s1, t3, 63
+; RV64-NEXT: and s3, s1, a4
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: srli s5, a6, 63
+; RV64-NEXT: sub s4, s4, s3
+; RV64-NEXT: slli s3, t0, 1
+; RV64-NEXT: or s3, s3, s5
+; RV64-NEXT: srli t0, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a6, t2, a6
+; RV64-NEXT: seqz t2, a1
+; RV64-NEXT: or s0, s0, t0
+; RV64-NEXT: or s5, a1, t5
+; RV64-NEXT: sub t5, t5, t2
+; RV64-NEXT: and s6, s1, a3
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: andi t2, s1, 1
+; RV64-NEXT: or t0, t6, s3
+; RV64-NEXT: sltu t6, s2, s6
+; RV64-NEXT: snez s5, s5
+; RV64-NEXT: andi s3, s0, 1
+; RV64-NEXT: sub s1, s4, t6
+; RV64-NEXT: add a7, a7, s5
+; RV64-NEXT: addi a7, a7, 1
+; RV64-NEXT: andi a7, a7, 1
+; RV64-NEXT: or t6, a1, t5
+; RV64-NEXT: or s4, t6, a7
+; RV64-NEXT: sub s2, s2, s6
+; RV64-NEXT: li t6, 0
+; RV64-NEXT: li s0, 0
+; RV64-NEXT: beqz s4, .LBB3_29
+; RV64-NEXT: .LBB3_26: # %udiv-do-while
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: srli t3, s2, 63
+; RV64-NEXT: slli t4, s1, 1
+; RV64-NEXT: slli s2, s2, 1
+; RV64-NEXT: or s4, t4, t3
+; RV64-NEXT: andi t3, s3, 1
+; RV64-NEXT: or s2, s2, t3
+; RV64-NEXT: bne a2, s4, .LBB3_24
+; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1
+; RV64-NEXT: sltu t3, t1, s2
+; RV64-NEXT: j .LBB3_25
+; RV64-NEXT: .LBB3_28:
+; RV64-NEXT: li t3, 0
+; RV64-NEXT: li t4, 0
+; RV64-NEXT: .LBB3_29: # %udiv-loop-exit
+; RV64-NEXT: srli a2, a6, 63
+; RV64-NEXT: slli a3, t0, 1
+; RV64-NEXT: srli a4, t0, 63
+; RV64-NEXT: slli a6, a6, 1
+; RV64-NEXT: or a1, t2, a6
+; RV64-NEXT: or a2, t3, a2
+; RV64-NEXT: or a4, t4, a4
+; RV64-NEXT: or t0, a2, a3
+; RV64-NEXT: andi t2, a4, 1
+; RV64-NEXT: .LBB3_30: # %udiv-end
+; RV64-NEXT: andi a2, t2, 1
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: sd t0, 8(a0)
+; RV64-NEXT: sb a2, 16(a0)
+; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 192
+; RV64-NEXT: ret
%res = udiv i129 %x, %y
ret i129 %res
}
diff --git a/llvm/test/CodeGen/RISCV/min-max.ll b/llvm/test/CodeGen/RISCV/min-max.ll
index acde8ad..e7f6899 100644
--- a/llvm/test/CodeGen/RISCV/min-max.ll
+++ b/llvm/test/CodeGen/RISCV/min-max.ll
@@ -5,6 +5,12 @@
; RUN: FileCheck %s --check-prefixes=ZBB,RV32ZBB
; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | \
; RUN: FileCheck %s --check-prefixes=ZBB,RV64ZBB
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s | \
+; RUN: FileCheck %s --check-prefixes=XQCI
+; RUN: llc < %s -mtriple=riscv32 -mattr=+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV32I-SFB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV64I-SFB
; Basic tests.
@@ -23,6 +29,27 @@ define signext i8 @smax_i8(i8 signext %a, i8 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: max a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i8:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i8:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a1, a0, .LBB0_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB0_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i8:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a1, a0, .LBB0_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB0_2:
+; RV64I-SFB-NEXT: ret
%c = call i8 @llvm.smax.i8(i8 %a, i8 %b)
ret i8 %c
}
@@ -42,6 +69,27 @@ define signext i16 @smax_i16(i16 signext %a, i16 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: max a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i16:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i16:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a1, a0, .LBB1_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB1_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i16:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a1, a0, .LBB1_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB1_2:
+; RV64I-SFB-NEXT: ret
%c = call i16 @llvm.smax.i16(i16 %a, i16 %b)
ret i16 %c
}
@@ -61,6 +109,27 @@ define signext i32 @smax_i32(i32 signext %a, i32 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: max a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a1, a0, .LBB2_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB2_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a1, a0, .LBB2_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB2_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smax.i32(i32 %a, i32 %b)
ret i32 %c
}
@@ -112,6 +181,41 @@ define i64 @smax_i64(i64 %a, i64 %b) {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: max a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i64:
+; XQCI: # %bb.0:
+; XQCI-NEXT: sltu a4, a2, a0
+; XQCI-NEXT: slt a5, a3, a1
+; XQCI-NEXT: qc.mveq a5, a1, a3, a4
+; XQCI-NEXT: qc.mveqi a0, a5, 0, a2
+; XQCI-NEXT: qc.mveqi a1, a5, 0, a3
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i64:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: sltu a4, a2, a0
+; RV32I-SFB-NEXT: slt a5, a3, a1
+; RV32I-SFB-NEXT: bne a1, a3, .LBB3_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a5, a4
+; RV32I-SFB-NEXT: .LBB3_2:
+; RV32I-SFB-NEXT: bnez a5, .LBB3_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB3_4:
+; RV32I-SFB-NEXT: bnez a5, .LBB3_6
+; RV32I-SFB-NEXT: # %bb.5:
+; RV32I-SFB-NEXT: mv a1, a3
+; RV32I-SFB-NEXT: .LBB3_6:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i64:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a1, a0, .LBB3_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB3_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.smax.i64(i64 %a, i64 %b)
ret i64 %c
}
@@ -131,6 +235,27 @@ define signext i8 @smin_i8(i8 signext %a, i8 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: min a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i8:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i8:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a0, a1, .LBB4_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB4_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i8:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a0, a1, .LBB4_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB4_2:
+; RV64I-SFB-NEXT: ret
%c = call i8 @llvm.smin.i8(i8 %a, i8 %b)
ret i8 %c
}
@@ -150,6 +275,27 @@ define signext i16 @smin_i16(i16 signext %a, i16 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: min a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i16:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i16:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a0, a1, .LBB5_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB5_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i16:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a0, a1, .LBB5_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB5_2:
+; RV64I-SFB-NEXT: ret
%c = call i16 @llvm.smin.i16(i16 %a, i16 %b)
ret i16 %c
}
@@ -169,6 +315,27 @@ define signext i32 @smin_i32(i32 signext %a, i32 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: min a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvge a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: blt a0, a1, .LBB6_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB6_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a0, a1, .LBB6_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB6_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smin.i32(i32 %a, i32 %b)
ret i32 %c
}
@@ -220,6 +387,41 @@ define i64 @smin_i64(i64 %a, i64 %b) {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: min a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i64:
+; XQCI: # %bb.0:
+; XQCI-NEXT: sltu a4, a0, a2
+; XQCI-NEXT: slt a5, a1, a3
+; XQCI-NEXT: qc.mveq a5, a1, a3, a4
+; XQCI-NEXT: qc.mveqi a0, a5, 0, a2
+; XQCI-NEXT: qc.mveqi a1, a5, 0, a3
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i64:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: sltu a4, a0, a2
+; RV32I-SFB-NEXT: slt a5, a1, a3
+; RV32I-SFB-NEXT: bne a1, a3, .LBB7_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a5, a4
+; RV32I-SFB-NEXT: .LBB7_2:
+; RV32I-SFB-NEXT: bnez a5, .LBB7_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB7_4:
+; RV32I-SFB-NEXT: bnez a5, .LBB7_6
+; RV32I-SFB-NEXT: # %bb.5:
+; RV32I-SFB-NEXT: mv a1, a3
+; RV32I-SFB-NEXT: .LBB7_6:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i64:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: blt a0, a1, .LBB7_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB7_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.smin.i64(i64 %a, i64 %b)
ret i64 %c
}
@@ -239,6 +441,27 @@ define i8 @umax_i8(i8 zeroext %a, i8 zeroext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: maxu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_i8:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_i8:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a1, a0, .LBB8_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB8_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_i8:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a1, a0, .LBB8_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB8_2:
+; RV64I-SFB-NEXT: ret
%c = call i8 @llvm.umax.i8(i8 %a, i8 %b)
ret i8 %c
}
@@ -258,6 +481,27 @@ define i16 @umax_i16(i16 zeroext %a, i16 zeroext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: maxu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_i16:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_i16:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a1, a0, .LBB9_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB9_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_i16:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a1, a0, .LBB9_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB9_2:
+; RV64I-SFB-NEXT: ret
%c = call i16 @llvm.umax.i16(i16 %a, i16 %b)
ret i16 %c
}
@@ -277,6 +521,27 @@ define signext i32 @umax_i32(i32 signext %a, i32 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: maxu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a1, a0, .LBB10_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB10_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a1, a0, .LBB10_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB10_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umax.i32(i32 %a, i32 %b)
ret i32 %c
}
@@ -328,6 +593,41 @@ define i64 @umax_i64(i64 %a, i64 %b) {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: maxu a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_i64:
+; XQCI: # %bb.0:
+; XQCI-NEXT: sltu a4, a2, a0
+; XQCI-NEXT: sltu a5, a3, a1
+; XQCI-NEXT: qc.mveq a5, a1, a3, a4
+; XQCI-NEXT: qc.mveqi a0, a5, 0, a2
+; XQCI-NEXT: qc.mveqi a1, a5, 0, a3
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_i64:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: sltu a4, a2, a0
+; RV32I-SFB-NEXT: sltu a5, a3, a1
+; RV32I-SFB-NEXT: bne a1, a3, .LBB11_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a5, a4
+; RV32I-SFB-NEXT: .LBB11_2:
+; RV32I-SFB-NEXT: bnez a5, .LBB11_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB11_4:
+; RV32I-SFB-NEXT: bnez a5, .LBB11_6
+; RV32I-SFB-NEXT: # %bb.5:
+; RV32I-SFB-NEXT: mv a1, a3
+; RV32I-SFB-NEXT: .LBB11_6:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_i64:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a1, a0, .LBB11_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB11_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.umax.i64(i64 %a, i64 %b)
ret i64 %c
}
@@ -347,6 +647,27 @@ define zeroext i8 @umin_i8(i8 zeroext %a, i8 zeroext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: minu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_i8:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_i8:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a0, a1, .LBB12_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB12_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_i8:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a0, a1, .LBB12_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB12_2:
+; RV64I-SFB-NEXT: ret
%c = call i8 @llvm.umin.i8(i8 %a, i8 %b)
ret i8 %c
}
@@ -366,6 +687,27 @@ define zeroext i16 @umin_i16(i16 zeroext %a, i16 zeroext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: minu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_i16:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_i16:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a0, a1, .LBB13_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB13_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_i16:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a0, a1, .LBB13_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB13_2:
+; RV64I-SFB-NEXT: ret
%c = call i16 @llvm.umin.i16(i16 %a, i16 %b)
ret i16 %c
}
@@ -385,6 +727,27 @@ define signext i32 @umin_i32(i32 signext %a, i32 signext %b) {
; ZBB: # %bb.0:
; ZBB-NEXT: minu a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: bltu a0, a1, .LBB14_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB14_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a0, a1, .LBB14_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB14_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umin.i32(i32 %a, i32 %b)
ret i32 %c
}
@@ -436,6 +799,41 @@ define i64 @umin_i64(i64 %a, i64 %b) {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: minu a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_i64:
+; XQCI: # %bb.0:
+; XQCI-NEXT: sltu a4, a0, a2
+; XQCI-NEXT: sltu a5, a1, a3
+; XQCI-NEXT: qc.mveq a5, a1, a3, a4
+; XQCI-NEXT: qc.mveqi a0, a5, 0, a2
+; XQCI-NEXT: qc.mveqi a1, a5, 0, a3
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_i64:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: sltu a4, a0, a2
+; RV32I-SFB-NEXT: sltu a5, a1, a3
+; RV32I-SFB-NEXT: bne a1, a3, .LBB15_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a5, a4
+; RV32I-SFB-NEXT: .LBB15_2:
+; RV32I-SFB-NEXT: bnez a5, .LBB15_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB15_4:
+; RV32I-SFB-NEXT: bnez a5, .LBB15_6
+; RV32I-SFB-NEXT: # %bb.5:
+; RV32I-SFB-NEXT: mv a1, a3
+; RV32I-SFB-NEXT: .LBB15_6:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_i64:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: bltu a0, a1, .LBB15_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB15_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.umin.i64(i64 %a, i64 %b)
ret i64 %c
}
@@ -450,6 +848,18 @@ define signext i32 @smin_same_op_i32(i32 signext %a) {
; ZBB-LABEL: smin_same_op_i32:
; ZBB: # %bb.0:
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_same_op_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_same_op_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_same_op_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smin.i32(i32 %a, i32 %a)
ret i32 %c
}
@@ -462,6 +872,18 @@ define signext i32 @smax_same_op_i32(i32 signext %a) {
; ZBB-LABEL: smax_same_op_i32:
; ZBB: # %bb.0:
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_same_op_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_same_op_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_same_op_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smax.i32(i32 %a, i32 %a)
ret i32 %c
}
@@ -474,6 +896,18 @@ define signext i32 @umin_same_op_i32(i32 signext %a) {
; ZBB-LABEL: umin_same_op_i32:
; ZBB: # %bb.0:
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_same_op_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_same_op_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_same_op_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umin.i32(i32 %a, i32 %a)
ret i32 %c
}
@@ -486,6 +920,18 @@ define signext i32 @umax_same_op_i32(i32 signext %a) {
; ZBB-LABEL: umax_same_op_i32:
; ZBB: # %bb.0:
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_same_op_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_same_op_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_same_op_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umax.i32(i32 %a, i32 %a)
ret i32 %c
}
@@ -510,6 +956,19 @@ define signext i32 @smin_undef_i32() {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a0, 0
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_undef_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_undef_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_undef_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a0, 0
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smin.i32(i32 undef, i32 undef)
ret i32 %c
}
@@ -532,6 +991,19 @@ define signext i32 @smax_undef_i32() {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a0, 0
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_undef_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_undef_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_undef_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a0, 0
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smax.i32(i32 undef, i32 undef)
ret i32 %c
}
@@ -554,6 +1026,19 @@ define signext i32 @umin_undef_i32() {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a0, 0
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: umin_undef_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umin_undef_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umin_undef_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a0, 0
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umin.i32(i32 undef, i32 undef)
ret i32 %c
}
@@ -576,6 +1061,19 @@ define signext i32 @umax_undef_i32() {
; RV64ZBB: # %bb.0:
; RV64ZBB-NEXT: li a0, 0
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_undef_i32:
+; XQCI: # %bb.0:
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_undef_i32:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_undef_i32:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a0, 0
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.umax.i32(i32 undef, i32 undef)
ret i32 %c
}
@@ -595,6 +1093,29 @@ define signext i32 @smax_i32_pos_constant(i32 signext %a) {
; ZBB-NEXT: li a1, 10
; ZBB-NEXT: max a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i32_pos_constant:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.lilti a0, a0, 11, 10
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i32_pos_constant:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: li a1, 10
+; RV32I-SFB-NEXT: blt a1, a0, .LBB24_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB24_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i32_pos_constant:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a1, 10
+; RV64I-SFB-NEXT: blt a1, a0, .LBB24_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB24_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smax.i32(i32 %a, i32 10)
ret i32 %c
}
@@ -616,6 +1137,33 @@ define signext i32 @smax_i32_pos_constant_trailing_zeros(i32 signext %a) {
; ZBB-NEXT: li a1, 16
; ZBB-NEXT: max a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smax_i32_pos_constant_trailing_zeros:
+; XQCI: # %bb.0:
+; XQCI-NEXT: andi a1, a0, -8
+; XQCI-NEXT: li a0, 16
+; XQCI-NEXT: qc.mvlt a0, a0, a1, a1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: andi a1, a0, -8
+; RV32I-SFB-NEXT: li a0, 16
+; RV32I-SFB-NEXT: bge a0, a1, .LBB25_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB25_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: andi a1, a0, -8
+; RV64I-SFB-NEXT: li a0, 16
+; RV64I-SFB-NEXT: bge a0, a1, .LBB25_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB25_2:
+; RV64I-SFB-NEXT: ret
%b = and i32 %a, -8
%c = call i32 @llvm.smax.i32(i32 %b, i32 16)
%d = and i32 %c, -4
@@ -635,6 +1183,29 @@ define signext i32 @smin_i32_negone(i32 signext %a) {
; ZBB-NEXT: li a1, -1
; ZBB-NEXT: min a0, a0, a1
; ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i32_negone:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.ligei a0, a0, 0, -1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i32_negone:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: li a1, -1
+; RV32I-SFB-NEXT: bltz a0, .LBB26_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a1
+; RV32I-SFB-NEXT: .LBB26_2:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i32_negone:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a1, -1
+; RV64I-SFB-NEXT: bltz a0, .LBB26_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB26_2:
+; RV64I-SFB-NEXT: ret
%c = call i32 @llvm.smin.i32(i32 %a, i32 -1)
ret i32 %c
}
@@ -672,6 +1243,34 @@ define i64 @smin_i64_negone(i64 %a) {
; RV64ZBB-NEXT: li a1, -1
; RV64ZBB-NEXT: min a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: smin_i64_negone:
+; XQCI: # %bb.0:
+; XQCI-NEXT: qc.ligei a0, a1, 0, -1
+; XQCI-NEXT: qc.ligei a1, a1, 0, -1
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: smin_i64_negone:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: li a2, -1
+; RV32I-SFB-NEXT: bltz a1, .LBB27_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB27_2:
+; RV32I-SFB-NEXT: bltz a1, .LBB27_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a1, a2
+; RV32I-SFB-NEXT: .LBB27_4:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: smin_i64_negone:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a1, -1
+; RV64I-SFB-NEXT: bltz a0, .LBB27_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB27_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.smin.i64(i64 %a, i64 -1)
ret i64 %c
}
@@ -720,6 +1319,41 @@ define i64 @umax_i64_one(i64 %a, i64 %b) {
; RV64ZBB-NEXT: li a1, 1
; RV64ZBB-NEXT: maxu a0, a0, a1
; RV64ZBB-NEXT: ret
+;
+; XQCI-LABEL: umax_i64_one:
+; XQCI: # %bb.0:
+; XQCI-NEXT: mv a2, a1
+; XQCI-NEXT: qc.selectinei a2, 0, a0, 1
+; XQCI-NEXT: qc.liltui a0, a0, 2, 1
+; XQCI-NEXT: qc.mvnei a0, a1, 0, a2
+; XQCI-NEXT: ret
+;
+; RV32I-SFB-LABEL: umax_i64_one:
+; RV32I-SFB: # %bb.0:
+; RV32I-SFB-NEXT: li a2, 1
+; RV32I-SFB-NEXT: li a3, 1
+; RV32I-SFB-NEXT: beqz a1, .LBB28_2
+; RV32I-SFB-NEXT: # %bb.1:
+; RV32I-SFB-NEXT: mv a3, a0
+; RV32I-SFB-NEXT: .LBB28_2:
+; RV32I-SFB-NEXT: bnez a0, .LBB28_4
+; RV32I-SFB-NEXT: # %bb.3:
+; RV32I-SFB-NEXT: mv a0, a2
+; RV32I-SFB-NEXT: .LBB28_4:
+; RV32I-SFB-NEXT: beqz a1, .LBB28_6
+; RV32I-SFB-NEXT: # %bb.5:
+; RV32I-SFB-NEXT: mv a0, a3
+; RV32I-SFB-NEXT: .LBB28_6:
+; RV32I-SFB-NEXT: ret
+;
+; RV64I-SFB-LABEL: umax_i64_one:
+; RV64I-SFB: # %bb.0:
+; RV64I-SFB-NEXT: li a1, 1
+; RV64I-SFB-NEXT: bnez a0, .LBB28_2
+; RV64I-SFB-NEXT: # %bb.1:
+; RV64I-SFB-NEXT: mv a0, a1
+; RV64I-SFB-NEXT: .LBB28_2:
+; RV64I-SFB-NEXT: ret
%c = call i64 @llvm.umax.i64(i64 %a, i64 1)
ret i64 %c
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll
new file mode 100644
index 0000000..c19e93d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O1 -mtriple=riscv64 -mattr=+v < %s | FileCheck %s
+
+define i32 @pr134424(i64 %input_value, i32 %base_value, i1 %cond_flag1, i1 %cond_flag2, i1 %cond_flag3) {
+; CHECK-LABEL: pr134424:
+; CHECK: # %bb.0: # %for.body.us.preheader.i
+; CHECK-NEXT: andi a3, a3, 1
+; CHECK-NEXT: andi a5, a2, 1
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 14
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: bnez a5, .LBB0_2
+; CHECK-NEXT: # %bb.1: # %for.body.us.preheader.i
+; CHECK-NEXT: li a2, 1
+; CHECK-NEXT: .LBB0_2: # %for.body.us.preheader.i
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: andi a4, a4, 1
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: bnez a3, .LBB0_4
+; CHECK-NEXT: # %bb.3: # %for.body.us.preheader.i
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: .LBB0_4: # %for.body.us.preheader.i
+; CHECK-NEXT: vmsle.vi v0, v8, 0
+; CHECK-NEXT: sext.w a2, a2
+; CHECK-NEXT: bnez a4, .LBB0_6
+; CHECK-NEXT: # %bb.5: # %for.body.us.preheader.i
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: .LBB0_6: # %for.body.us.preheader.i
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vredmin.vs v8, v8, v8
+; CHECK-NEXT: vmv.x.s a3, v8
+; CHECK-NEXT: sext.w a1, a1
+; CHECK-NEXT: bge a3, a2, .LBB0_11
+; CHECK-NEXT: # %bb.7: # %for.body.us.preheader.i
+; CHECK-NEXT: bge a0, a1, .LBB0_12
+; CHECK-NEXT: .LBB0_8: # %for.body.us.preheader.i
+; CHECK-NEXT: blt a3, a0, .LBB0_10
+; CHECK-NEXT: .LBB0_9: # %for.body.us.preheader.i
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_10: # %for.body.us.preheader.i
+; CHECK-NEXT: sw a3, 0(zero)
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_11: # %for.body.us.preheader.i
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: blt a0, a1, .LBB0_8
+; CHECK-NEXT: .LBB0_12: # %for.body.us.preheader.i
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: bge a3, a0, .LBB0_9
+; CHECK-NEXT: j .LBB0_10
+for.body.us.preheader.i:
+ %partial_vector = insertelement <4 x i64> zeroinitializer, i64 %input_value, i64 1
+ %comparison_vector = shufflevector <4 x i64> %partial_vector, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+ %comparison_result = icmp sle <4 x i64> %comparison_vector, zeroinitializer
+ %selected_value1 = select i1 %cond_flag1, i32 %base_value, i32 1
+ %selected_value2 = select i1 %cond_flag2, i32 %base_value, i32 1
+ %selected_value3 = select i1 %cond_flag3, i32 %base_value, i32 1
+ %bool_to_int = zext <4 x i1> %comparison_result to <4 x i32>
+ %extended_vector = shufflevector <4 x i32> %bool_to_int, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %vector_min = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %extended_vector)
+ %min1 = call i32 @llvm.smin.i32(i32 %vector_min, i32 %selected_value1)
+ %min2 = call i32 @llvm.smin.i32(i32 %selected_value2, i32 %selected_value3)
+ %final_min = call i32 @llvm.smin.i32(i32 %min1, i32 %min2)
+ store i32 %final_min, ptr null, align 4
+ ret i32 0
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir
new file mode 100644
index 0000000..aeab8f6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=register-coalescer -o - %s | FileCheck %s
+
+---
+name: pr71023
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: pr71023
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $x10, $v8, $v10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:gpr = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[PseudoVMV_V_I_M1_:%[0-9]+]].sub_vrm1_2:vrn8m1 = PseudoVMV_V_I_M1 undef [[PseudoVMV_V_I_M1_]].sub_vrm1_2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]].sub_vrm1_6:vrn8m1 = COPY undef [[PseudoVMV_V_I_M1_]].sub_vrm1_2
+ ; CHECK-NEXT: BNE undef [[DEF]], $x0, %bb.3
+ ; CHECK-NEXT: PseudoBR %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BNE undef [[DEF]], $x0, %bb.3
+ ; CHECK-NEXT: PseudoBR %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber [[PseudoVMV_V_I_M1_]].sub_vrm1_0:vrn8m1 = PseudoVRGATHER_VI_M1 undef [[PseudoVMV_V_I_M1_]].sub_vrm1_0, [[PseudoVMV_V_I_M1_]].sub_vrm1_2, 0, 0, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoVSSEG6E8_V_M1_MASK [[PseudoVMV_V_I_M1_]].sub_vrm1_0_sub_vrm1_1_sub_vrm1_2_sub_vrm1_3_sub_vrm1_4_sub_vrm1_5, undef [[DEF]], killed undef $v0, 0, 3 /* e8 */, implicit $vl, implicit $vtype :: (store unknown-size, align 1)
+ ; CHECK-NEXT: PseudoRET
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $x10, $v8, $v10
+ %0:gpr = IMPLICIT_DEF
+ %1:vrnov0 = PseudoVMV_V_I_M1 undef %1, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ %2:vrnov0 = IMPLICIT_DEF
+ undef %3.sub_vrm1_0:vrn6m1nov0 = COPY undef %1
+ %3.sub_vrm1_3:vrn6m1nov0 = COPY %2
+ %3.sub_vrm1_4:vrn6m1nov0 = COPY undef %1
+ BNE undef %0, $x0, %bb.3
+ PseudoBR %bb.1
+ bb.1:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ BNE killed undef %0, $x0, %bb.3
+ PseudoBR %bb.2
+ bb.2:
+ successors: %bb.3(0x80000000)
+ bb.3:
+ %4:vr = IMPLICIT_DEF
+ early-clobber %4:vr = PseudoVRGATHER_VI_M1 undef %4, killed %1, 0, 0, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ undef %5.sub_vrm1_0:vrn6m1 = COPY killed %4
+ %5.sub_vrm1_5:vrn6m1 = COPY killed %2
+ PseudoVSSEG6E8_V_M1_MASK killed %5, undef %0, killed undef $v0, 0, 3 /* e8 */, implicit $vl, implicit $vtype :: (store unknown-size, align 1)
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll
new file mode 100644
index 0000000..d9a49a1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v -O0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64
+
+define void @matmul() {
+; CHECK-RV64-LABEL: matmul:
+; CHECK-RV64: # %bb.0: # %entry
+; CHECK-RV64-NEXT: li a0, 0
+; CHECK-RV64-NEXT: vsetvli zero, a0, 512
+; CHECK-RV64-NEXT: sf.vsettm zero, a0
+; CHECK-RV64-NEXT: sf.vtzero.t mt0
+; CHECK-RV64-NEXT: ret
+entry:
+ call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 0, i64 0, i64 3, i64 1)
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.riscv.sf.vtzero.t.i64(i64 immarg, i64, i64, i64 immarg, i64 immarg) #0
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
new file mode 100644
index 0000000..389283a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
@@ -0,0 +1,523 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \
+# RUN: -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s
+
+--- |
+ define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4)
+ ret void
+ }
+
+ define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %1
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %2
+ }
+
+ define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn)
+ ret void
+ }
+
+ define i64 @vsettnt_max(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettm(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettn(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettk(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define void @sf_vtzero(i64 %tm, i64 %tn) {
+ entry:
+ call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4)
+ ret void
+ }
+
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64)
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64)
+ declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64)
+ declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64)
+ declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64)
+ declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64)
+...
+---
+name: xsfmm_same_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_same_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state_bf
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v8m8', virtual-reg: '%1' }
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state_bf
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: interleave_rvv_and_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %5:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: interleave_rvv_and_xsfmm2
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0
+ %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %6:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: consecutive_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+ - { reg: '$x12', virtual-reg: '%3' }
+ - { reg: '$x13', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-LABEL: name: consecutive_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:vrm8 = COPY $v8m8
+ %1:gprnox0 = COPY $x10
+ %2:gprnox0 = COPY $x11
+ %3:gprnox0 = COPY $x12
+ %4:gprnox0 = COPY $x13
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm
+ PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1
+ PseudoRET
+...
+---
+name: vsettnt_max
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: vsettnt_max
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %3:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettm
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettn
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettn
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettk
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettk
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: sf_vtzero
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+ - { id: 1, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+ - { reg: '$x11', virtual-reg: '%1' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: sf_vtzero
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = COPY $x11
+ PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll
new file mode 100644
index 0000000..9b9a849
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.e4m3.e4m3 mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll
new file mode 100644
index 0000000..b63974f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.e4m3.e5m2 mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll
new file mode 100644
index 0000000..62d629b1
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.e5m2.e4m3 mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll
new file mode 100644
index 0000000..7a90c97
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.e5m2.e5m2 mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll
new file mode 100644
index 0000000..29451c6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen, <vscale x 32 x half>, <vscale x 32 x half>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_f_f_w2_f16m8(iXLen %mtd, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_f_f_w2_f16m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e16, w2
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.f.f mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen 0, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 2)
+ ret void
+}
+
+declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen, <vscale x 16 x float>, <vscale x 16 x float>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_f_f_w1_f32m8(iXLen %mtd, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_f_f_w1_f32m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.f.f mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen 0, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1)
+ ret void
+}
+
+declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen, <vscale x 8 x double>, <vscale x 8 x double>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_f_f_w1_f64m8(iXLen %mtd, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_f_f_w1_f64m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.f.f mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen 0, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll
new file mode 100644
index 0000000..6a4b29f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_s_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_s_s_w4_i8m8_i8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.s.s mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll
new file mode 100644
index 0000000..79239b0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_s_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_s_u_w4_i8m8_i8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.s.u mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll
new file mode 100644
index 0000000..b0d039b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_u_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_u_s_w4_i8m8_i8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.u.s mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll
new file mode 100644
index 0000000..913c277
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen)
+
+define void @test_sf_mm_u_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) {
+; CHECK-LABEL: test_sf_mm_u_u_w4_i8m8_i8m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a1
+; CHECK-NEXT: sf.vsettk zero, a3
+; CHECK-NEXT: sf.mm.u.u mt0, v8, v16
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll
new file mode 100644
index 0000000..8048dec
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vlte16.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vlte16(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vlte16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1
+; CHECK-NEXT: sf.vlte16 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vlte16.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll
new file mode 100644
index 0000000..a526dc8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vlte32.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vlte32(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vlte32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1
+; CHECK-NEXT: sf.vlte32 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vlte32.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll
new file mode 100644
index 0000000..ed0c48a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vlte64.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vlte64(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vlte64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1
+; CHECK-NEXT: sf.vlte64 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vlte64.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll
new file mode 100644
index 0000000..67b3ed2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vlte8.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vlte8(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vlte8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1
+; CHECK-NEXT: sf.vlte8 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vlte8.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll
new file mode 100644
index 0000000..4da37fa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare iXLen @llvm.riscv.sf.vsettk.iXLen(iXLen, iXLen, iXLen)
+
+define iXLen @test_sf_vsettk(iXLen %tk) {
+; CHECK-LABEL: test_sf_vsettk:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a1, zero, e16, w2
+; CHECK-NEXT: sf.vsettk a0, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettk.iXLen(iXLen %tk, iXLen 1, iXLen 2)
+ ret iXLen %0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll
new file mode 100644
index 0000000..143c26c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen, iXLen, iXLen)
+
+define iXLen @test_sf_vsettm(iXLen %tm) {
+; CHECK-LABEL: test_sf_vsettm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a1, zero, e8, w4
+; CHECK-NEXT: sf.vsettm a0, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen %tm, iXLen 0, iXLen 3)
+ ret iXLen %0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll
new file mode 100644
index 0000000..48fa1bc8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen, iXLen, iXLen)
+
+define iXLen @test_sf_vsettnt_e8w1(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e8w1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e8, w1
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 1)
+ ret iXLen %0
+}
+
+define iXLen @test_sf_vsettnt_e8w2(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e8w2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e8, w2
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 2)
+ ret iXLen %0
+}
+
+define iXLen @test_sf_vsettnt_e8w4(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e8w4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e8, w4
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 3)
+ ret iXLen %0
+}
+
+define iXLen @test_sf_vsettnt_e16w1(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e16w1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e16, w1
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 1)
+ ret iXLen %0
+}
+
+define iXLen @test_sf_vsettnt_e16w2(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e16w2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e16, w2
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 2)
+ ret iXLen %0
+}
+
+define iXLen @test_sf_vsettnt_e16w4(iXLen %tn) {
+; CHECK-LABEL: test_sf_vsettnt_e16w4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt a0, a0, e16, w4
+; CHECK-NEXT: ret
+ entry:
+ %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 3)
+ ret iXLen %0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll
new file mode 100644
index 0000000..7a76151
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vste16.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vste16(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vste16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1
+; CHECK-NEXT: sf.vste16 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vste16.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll
new file mode 100644
index 0000000..8ff6e6a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vste32.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vste32(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vste32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1
+; CHECK-NEXT: sf.vste32 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vste32.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll
new file mode 100644
index 0000000..53990e4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vste64.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vste64(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vste64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1
+; CHECK-NEXT: sf.vste64 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vste64.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll
new file mode 100644
index 0000000..09b7259
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vste8.iXLen(iXLen, ptr, iXLen)
+
+define dso_local void @test_sf_vste8(iXLen %tss, ptr %base, iXLen %vl) {
+; CHECK-LABEL: test_sf_vste8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1
+; CHECK-NEXT: sf.vste8 a0, (a1)
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vste8.iXLen(iXLen %tss, ptr %base, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll
new file mode 100644
index 0000000..394eb60
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vtdiscard()
+
+define dso_local void @test_sf_vtdiscard() {
+; CHECK-LABEL: test_sf_vtdiscard:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vtdiscard
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtdiscard()
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll
new file mode 100644
index 0000000..66c9d26
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen, <vscale x 32 x bfloat>, iXLen)
+
+define void @test_sf_vtmv_t_v_bf16m8(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_bf16m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen, <vscale x 32 x half>, iXLen)
+
+define void @test_sf_vtmv_t_v_f16(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen, <vscale x 16 x float>, iXLen)
+
+define void @test_sf_vtmv_t_v_f32(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen, <vscale x 8 x double>, iXLen)
+
+define void @test_sf_vtmv_t_v_f64(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen, <vscale x 64 x i8>, iXLen)
+
+define void @test_sf_vtmv_t_v_i8(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen, <vscale x 32 x i16>, iXLen)
+
+define void @test_sf_vtmv_t_v_i16(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen, <vscale x 16 x i32>, iXLen)
+
+define void @test_sf_vtmv_t_v_i32(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl)
+ ret void
+}
+
+declare void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen, <vscale x 8 x i64>, iXLen)
+
+define void @test_sf_vtmv_t_v_i64(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_t_v_i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1
+; CHECK-NEXT: sf.vtmv.t.v a0, v8
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll
new file mode 100644
index 0000000..0dcc2ab
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen, iXLen)
+
+define <vscale x 32 x bfloat> @test_sf_vtmv_v_t_bf16m8(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_bf16m8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 32 x bfloat> %0
+}
+
+declare <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen, iXLen)
+
+define <vscale x 32 x half> @test_sf_vtmv_v_t_f16(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_f16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 32 x half> %0
+}
+
+declare <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen, iXLen)
+
+define <vscale x 16 x float> @test_sf_vtmv_v_t_f32(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 16 x float> %0
+}
+
+declare <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen, iXLen)
+
+define <vscale x 8 x double> @test_sf_vtmv_v_t_f64(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 8 x double> %0
+}
+
+declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen, iXLen)
+
+define <vscale x 64 x i8> @test_sf_vtmv_v_t_i8(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 64 x i8> %0
+}
+
+declare <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen, iXLen)
+
+define <vscale x 32 x i16> @test_sf_vtmv_v_t_i16(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_i16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 32 x i16> %0
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen, iXLen)
+
+define <vscale x 16 x i32> @test_sf_vtmv_v_t_i32(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 16 x i32> %0
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen, iXLen)
+
+define <vscale x 8 x i64> @test_sf_vtmv_v_t_i64(iXLen %tss, iXLen %vl) {
+; CHECK-LABEL: test_sf_vtmv_v_t_i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1
+; CHECK-NEXT: sf.vtmv.v.t v8, a0
+; CHECK-NEXT: ret
+ entry:
+ %0 = call <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen %tss, iXLen %vl)
+ ret <vscale x 8 x i64> %0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll
new file mode 100644
index 0000000..bbccb02
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \
+; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+declare void @llvm.riscv.sf.vtzero.t.iXLen(iXLen, iXLen, iXLen, iXLen, iXLen)
+define void @test_sf_vtzero_t(iXLen %tm, iXLen %tn) {
+; CHECK-LABEL: test_sf_vtzero_t:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sf.vsettnt zero, a1, e8, w4
+; CHECK-NEXT: sf.vsettm zero, a0
+; CHECK-NEXT: sf.vtzero.t mt0
+; CHECK-NEXT: ret
+ entry:
+ call void @llvm.riscv.sf.vtzero.t.iXLen(iXLen 0, iXLen %tm, iXLen %tn, iXLen 3, iXLen 4)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
index 2f03ff9..318268a 100644
--- a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
+++ b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
@@ -15,8 +15,7 @@ define i32 @from_cmpeq(i32 %xx, i32 %y) {
;
; RV64I-LABEL: from_cmpeq:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -9
+; RV64I-NEXT: addiw a0, a0, -9
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
@@ -39,8 +38,7 @@ define i32 @from_cmpeq_fail_bad_andmask(i32 %xx, i32 %y) {
;
; RV64I-LABEL: from_cmpeq_fail_bad_andmask:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -9
+; RV64I-NEXT: addiw a0, a0, -9
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/setcc-logic.ll b/llvm/test/CodeGen/RISCV/setcc-logic.ll
index fabb573..4e14893 100644
--- a/llvm/test/CodeGen/RISCV/setcc-logic.ll
+++ b/llvm/test/CodeGen/RISCV/setcc-logic.ll
@@ -104,9 +104,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind {
;
; RV64I-LABEL: and_icmps_const_not1bit_diff:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a1, a0, -44
-; RV64I-NEXT: addi a0, a0, -92
+; RV64I-NEXT: addiw a1, a0, -44
+; RV64I-NEXT: addiw a0, a0, -92
; RV64I-NEXT: snez a1, a1
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: and a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
index bdbe4ed..07bfbe6 100644
--- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
+++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
@@ -674,8 +674,7 @@ define i32 @sext_of_not_cmp_i32(i32 %x) {
;
; RV64-LABEL: sext_of_not_cmp_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: addi a0, a0, -7
+; RV64-NEXT: addiw a0, a0, -7
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: addi a0, a0, -1
; RV64-NEXT: ret
@@ -718,8 +717,7 @@ define i32 @dec_of_zexted_cmp_i32(i32 %x) {
;
; RV64-LABEL: dec_of_zexted_cmp_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: addi a0, a0, -7
+; RV64-NEXT: addiw a0, a0, -7
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: addi a0, a0, -1
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 2751332c..bf6802d 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1047,8 +1047,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64-LABEL: usubo.i32.constant.lhs:
; RV64: # %bb.0: # %entry
; RV64-NEXT: li a2, -2
-; RV64-NEXT: subw a2, a2, a0
-; RV64-NEXT: addi a0, a2, 1
+; RV64-NEXT: sub a2, a2, a0
+; RV64-NEXT: addiw a0, a2, 1
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
@@ -1065,8 +1065,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64ZBA-LABEL: usubo.i32.constant.lhs:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: li a2, -2
-; RV64ZBA-NEXT: subw a2, a2, a0
-; RV64ZBA-NEXT: addi a0, a2, 1
+; RV64ZBA-NEXT: sub a2, a2, a0
+; RV64ZBA-NEXT: addiw a0, a2, 1
; RV64ZBA-NEXT: seqz a0, a0
; RV64ZBA-NEXT: sw a2, 0(a1)
; RV64ZBA-NEXT: ret
@@ -1083,8 +1083,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64ZICOND-LABEL: usubo.i32.constant.lhs:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: li a2, -2
-; RV64ZICOND-NEXT: subw a2, a2, a0
-; RV64ZICOND-NEXT: addi a0, a2, 1
+; RV64ZICOND-NEXT: sub a2, a2, a0
+; RV64ZICOND-NEXT: addiw a0, a2, 1
; RV64ZICOND-NEXT: seqz a0, a0
; RV64ZICOND-NEXT: sw a2, 0(a1)
; RV64ZICOND-NEXT: ret
diff --git a/llvm/test/CodeGen/SPIRV/FCmpFalse.ll b/llvm/test/CodeGen/SPIRV/FCmpFalse.ll
new file mode 100644
index 0000000..55d64196
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/FCmpFalse.ll
@@ -0,0 +1,10 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#FalseVal:]] = OpConstantFalse %[[#]]
+; CHECK: OpReturnValue %[[#FalseVal]]
+
+define spir_func i1 @f(float %0) {
+ %2 = fcmp false float %0, %0
+ ret i1 %2
+}
diff --git a/llvm/test/CodeGen/SPIRV/FCmpFalse_Vec.ll b/llvm/test/CodeGen/SPIRV/FCmpFalse_Vec.ll
new file mode 100644
index 0000000..c410b64
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/FCmpFalse_Vec.ll
@@ -0,0 +1,13 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#BoolTy:]] = OpTypeBool
+; CHECK: %[[#VecTy:]] = OpTypeVector %[[#BoolTy]] 4
+; CHECK: %[[#False:]] = OpConstantFalse %[[#BoolTy]]
+; CHECK: %[[#Composite:]] = OpConstantComposite %[[#VecTy]] %[[#False]] %[[#False]] %[[#False]] %[[#False]]
+; CHECK: OpReturnValue %[[#Composite]]
+
+define spir_func <4 x i1> @test(<4 x float> %a) {
+ %compare = fcmp false <4 x float> %a, %a
+ ret <4 x i1> %compare
+}
diff --git a/llvm/test/CodeGen/SPIRV/builtin_duplicate.ll b/llvm/test/CodeGen/SPIRV/builtin_duplicate.ll
new file mode 100644
index 0000000..8786554
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/builtin_duplicate.ll
@@ -0,0 +1,20 @@
+;; This test checks if we generate a single builtin variable for the following
+;; LLVM IR.
+;; @__spirv_BuiltInLocalInvocationId - A global variable
+;; %3 = tail call i64 @_Z12get_local_idj(i32 0) - A function call
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpName %[[#]] "__spirv_BuiltInLocalInvocationId"
+; CHECK-NOT: OpName %[[#]] "__spirv_BuiltInLocalInvocationId.1"
+
+@__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+
+declare spir_func i64 @_Z12get_local_idj(i32) local_unnamed_addr
+
+define spir_kernel void @test(i32 %a) {
+entry:
+ %builtin_call = tail call i64 @_Z12get_local_idj(i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/complex-constexpr.ll b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll
new file mode 100644
index 0000000..e2c1d00
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/complex-constexpr.ll
@@ -0,0 +1,21 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+@.str.1 = private unnamed_addr addrspace(1) constant [1 x i8] zeroinitializer, align 1
+
+define linkonce_odr hidden spir_func void @test() {
+entry:
+; CHECK: %[[#MinusOne:]] = OpConstant %[[#]] 18446744073709551615
+; CHECK: %[[#Ptr:]] = OpConvertUToPtr %[[#]] %[[#MinusOne]]
+; CHECK: %[[#PtrCast:]] = OpPtrCastToGeneric %[[#]] %[[#]]
+; CHECK: %[[#]] = OpFunctionCall %[[#]] %[[#]] %[[#PtrCast]] %[[#Ptr]]
+
+ %cast = bitcast ptr addrspace(4) inttoptr (i64 -1 to ptr addrspace(4)) to ptr addrspace(4)
+ call spir_func void @bar(ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), ptr addrspace(4) %cast)
+ ret void
+}
+
+define linkonce_odr hidden spir_func void @bar(ptr addrspace(4) %begin, ptr addrspace(4) %end) {
+entry:
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/dominator-order.ll b/llvm/test/CodeGen/SPIRV/dominator-order.ll
new file mode 100644
index 0000000..2ecdddc
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/dominator-order.ll
@@ -0,0 +1,25 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; This test checks that basic blocks are reordered in SPIR-V so that dominators
+; are emitted ahead of their dominated blocks as required by the SPIR-V
+; specification.
+
+; CHECK-DAG: OpName %[[#ENTRY:]] "entry"
+; CHECK-DAG: OpName %[[#FOR_BODY137_LR_PH:]] "for.body137.lr.ph"
+; CHECK-DAG: OpName %[[#FOR_BODY:]] "for.body"
+
+; CHECK: %[[#ENTRY]] = OpLabel
+; CHECK: %[[#FOR_BODY]] = OpLabel
+; CHECK: %[[#FOR_BODY137_LR_PH]] = OpLabel
+
+define spir_kernel void @test(ptr addrspace(1) %arg, i1 %cond) {
+entry:
+ br label %for.body
+
+for.body137.lr.ph: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ br i1 %cond, label %for.body, label %for.body137.lr.ph
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll
new file mode 100644
index 0000000..105f4a4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_maximal_reconvergence/enable-maximal-reconvergence.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-unknown-vulkan1.3-compute --spirv-ext=+SPV_KHR_maximal_reconvergence %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute --spirv-ext=+SPV_KHR_maximal_reconvergence %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpCapability Shader
+; CHECK: OpExtension "SPV_KHR_maximal_reconvergence"
+; CHECK-NOT: OpExecutionMode {{.*}} MaximallyReconvergesKHR
+; CHECK: OpExecutionMode [[main:%[0-9]+]] MaximallyReconvergesKHR
+; CHECK-NOT: OpExecutionMode {{.*}} MaximallyReconvergesKHR
+; CHECK: OpName [[main]] "main"
+define void @main() local_unnamed_addr #0 {
+entry:
+ ret void
+}
+
+define void @negative() local_unnamed_addr #1 {
+entry:
+ ret void
+}
+
+attributes #0 = { "enable-maximal-reconvergence"="true" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll
new file mode 100644
index 0000000..cce1eda
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWBufferDynamicIdx.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+
+%"__cblayout_$Globals" = type <{ i32 }>
+
+@i = external hidden local_unnamed_addr addrspace(12) global i32, align 4
+@ReadWriteBuf.str = private unnamed_addr constant [13 x i8] c"ReadWriteBuf\00", align 1
+@"$Globals.cb" = local_unnamed_addr global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) poison
+@"$Globals.str" = private unnamed_addr constant [9 x i8] c"$Globals\00", align 1
+
+; CHECK: OpCapability Shader
+; CHECK: OpCapability StorageTexelBufferArrayDynamicIndexingEXT
+
+define void @main() local_unnamed_addr #0 {
+entry:
+ %"$Globals.cb_h.i.i" = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) @"llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_$Globalss_4_0t_2_0t"(i32 1, i32 0, i32 1, i32 0, ptr nonnull @"$Globals.str")
+ store target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) %"$Globals.cb_h.i.i", ptr @"$Globals.cb", align 8
+ %0 = load i32, ptr addrspace(12) @i, align 4
+ %1 = tail call target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) @llvm.spv.resource.handlefromimplicitbinding.tspirv.Image_i32_5_2_0_0_2_33t(i32 0, i32 0, i32 64, i32 %0, ptr nonnull @ReadWriteBuf.str)
+ %2 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.Image_i32_5_2_0_0_2_33t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) %1, i32 98)
+ store i32 99, ptr addrspace(11) %2, align 4
+ ret void
+} \ No newline at end of file
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll
new file mode 100644
index 0000000..da69a2f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/DynamicIdx/RWStructuredBufferDynamicIdx.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
+
+%"__cblayout_$Globals" = type <{ i32 }>
+
+@i = external hidden local_unnamed_addr addrspace(12) global i32, align 4
+@ReadWriteStructuredBuf.str = private unnamed_addr constant [23 x i8] c"ReadWriteStructuredBuf\00", align 1
+@"$Globals.cb" = local_unnamed_addr global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) poison
+@"$Globals.str" = private unnamed_addr constant [9 x i8] c"$Globals\00", align 1
+
+; CHECK: OpCapability Shader
+; CHECK: OpCapability StorageBufferArrayDynamicIndexing
+define void @main() local_unnamed_addr #0 {
+entry:
+ %"$Globals.cb_h.i.i" = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) @"llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_$Globalss_4_0t_2_0t"(i32 2, i32 0, i32 1, i32 0, ptr nonnull @"$Globals.str")
+ store target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 4, 0), 2, 0) %"$Globals.cb_h.i.i", ptr @"$Globals.cb", align 8
+ %0 = load i32, ptr addrspace(12) @i, align 4
+ %1 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 0, i32 64, i32 %0, ptr nonnull @ReadWriteStructuredBuf.str)
+ %2 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %1, i32 99)
+ store i32 98, ptr addrspace(11) %2, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll
index 92efad9..92efad9 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/StructuredBufferNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWBufferNonUniformIdx.ll
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
index 2a12baf..a820e7a 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/NonUniformIdx/RWStructuredBufferNonUniformIdx.ll
@@ -3,6 +3,7 @@
; CHECK-DAG: OpCapability Shader
; CHECK-DAG: OpCapability ShaderNonUniformEXT
+; CHECK-DAG: OpCapability StorageBufferArrayNonUniformIndexingEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
; CHECK-DAG: OpDecorate {{%[0-9]+}} NonUniformEXT
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll
index d002097..e4ec231 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageConstIdx.ll
@@ -4,8 +4,8 @@
@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
; CHECK-DAG: OpCapability Shader
-; CHECK-DAG: OpCapability StorageImageArrayDynamicIndexing
; CHECK-DAG: OpCapability Image1D
+; CHECK-DAG: OpCapability Int8
; CHECK-NOT: OpCapability
; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3
diff --git a/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll
new file mode 100644
index 0000000..ddc2585
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll
@@ -0,0 +1,19 @@
+; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %}
+
+; Verify that llvm.compiler.used is not lowered.
+; CHECK: OpName %{{[0-9]+}} "unused"
+; CHECK-NOT: OpName %{{[0-9]+}} "llvm.compiler.used"
+
+; Check that the type of llvm.compiler.used is not emitted too.
+; CHECK-NOT: OpTypeArray
+
+@unused = private addrspace(3) global i32 0
+@llvm.compiler.used = appending addrspace(2) global [1 x ptr addrspace (4)] [ptr addrspace(4) addrspacecast (ptr addrspace(3) @unused to ptr addrspace(4))]
+
+define spir_func void @foo() {
+entry:
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fake_use.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fake_use.ll
new file mode 100644
index 0000000..5370b51
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fake_use.ll
@@ -0,0 +1,13 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpCapability Addresses
+; CHECK-DAG: OpName %[[#]] "foo"
+
+declare void @llvm.fake.use(...)
+
+define spir_kernel void @foo(ptr addrspace(1) %a) {
+entry:
+ call void (...) @llvm.fake.use(ptr addrspace(1) %a)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchange_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchange_cl20.ll
new file mode 100644
index 0000000..8357373
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchange_cl20.ll
@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64v1.2-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-NOT: OpCapability Int64Atomics
+
+; CHECK-DAG: %[[#int:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#int8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#DeviceScope:]] = OpConstant %[[#int]] 1
+; CHECK-DAG: %[[#SequentiallyConsistent_MS:]] = OpConstant %[[#int]] 16
+; CHECK-DAG: %[[#int_ptr:]] = OpTypePointer Generic %[[#int]]
+; CHECK-DAG: %[[#int_ptr8:]] = OpTypePointer Generic %[[#int8]]
+; CHECK-DAG: %[[#bool:]] = OpTypeBool
+
+define spir_func void @test(ptr addrspace(4) %object, ptr addrspace(4) %expected, i32 %desired) {
+
+; CHECK: %[[#object:]] = OpFunctionParameter %[[#int_ptr8]]
+; CHECK: %[[#expected:]] = OpFunctionParameter %[[#int_ptr8]]
+; CHECK: %[[#desired:]] = OpFunctionParameter %[[#int]]
+
+entry:
+ %object.addr = alloca ptr addrspace(4), align 4
+ %expected.addr = alloca ptr addrspace(4), align 4
+ %desired.addr = alloca i32, align 4
+ %strong_res = alloca i8, align 1
+ %res = alloca i8, align 1
+ %weak_res = alloca i8, align 1
+ store ptr addrspace(4) %object, ptr %object.addr, align 4
+ store ptr addrspace(4) %expected, ptr %expected.addr, align 4
+ store i32 %desired, ptr %desired.addr, align 4
+ %0 = load ptr addrspace(4), ptr %object.addr, align 4
+ %1 = load ptr addrspace(4), ptr %expected.addr, align 4
+ %2 = load i32, ptr %desired.addr, align 4
+
+; CHECK-DAG: OpStore %[[#object_addr:]] %[[#object]]
+; CHECK-DAG: OpStore %[[#expected_addr:]] %[[#expected]]
+; CHECK-DAG: OpStore %[[#desired_addr:]] %[[#desired]]
+
+; CHECK: %[[#Pointer:]] = OpLoad %[[#int_ptr]] %[[#]]
+; CHECK: %[[#exp:]] = OpLoad %[[#int_ptr]] %[[#]]
+; CHECK: %[[#Value:]] = OpLoad %[[#int]] %[[#desired_addr]]
+; CHECK: %[[#Comparator:]] = OpLoad %[[#int]] %[[#exp]]
+
+; CHECK: %[[#Result:]] = OpAtomicCompareExchange %[[#int]] %[[#]] %[[#DeviceScope]] %[[#SequentiallyConsistent_MS]] %[[#SequentiallyConsistent_MS]] %[[#Value]] %[[#Comparator]]
+ %call = call spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4) %0, ptr addrspace(4) %1, i32 %2)
+
+; CHECK-NEXT: OpStore %[[#exp]] %[[#Result]]
+; CHECK-NEXT: %[[#CallRes:]] = OpIEqual %[[#bool]] %[[#Result]] %[[#Comparator]]
+; CHECK-NOT: %[[#Result]]
+
+ %frombool = zext i1 %call to i8
+ store i8 %frombool, ptr %strong_res, align 1
+ %3 = load i8, ptr %strong_res, align 1
+ %tobool = trunc i8 %3 to i1
+ %lnot = xor i1 %tobool, true
+ %frombool1 = zext i1 %lnot to i8
+ store i8 %frombool1, ptr %res, align 1
+ %4 = load ptr addrspace(4), ptr %object.addr, align 4
+ %5 = load ptr addrspace(4), ptr %expected.addr, align 4
+ %6 = load i32, ptr %desired.addr, align 4
+
+; CHECK: %[[#Pointer:]] = OpLoad %[[#int_ptr]] %[[#]]
+; CHECK: %[[#exp:]] = OpLoad %[[#int_ptr]] %[[#]]
+; CHECK: %[[#Value:]] = OpLoad %[[#int]] %[[#desired_addr]]
+; CHECK: %[[#ComparatorWeak:]] = OpLoad %[[#int]] %[[#exp]]
+
+; CHECK: %[[#Result:]] = OpAtomicCompareExchangeWeak %[[#int]] %[[#]] %[[#DeviceScope]] %[[#SequentiallyConsistent_MS]] %[[#SequentiallyConsistent_MS]] %[[#Value]] %[[#ComparatorWeak]]
+ %call2 = call spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4) %4, ptr addrspace(4) %5, i32 %6)
+
+; CHECK-NEXT: OpStore %[[#exp]] %[[#Result]]
+; CHECK-NEXT: %[[#CallRes:]] = OpIEqual %[[#bool]] %[[#Result]] %[[#ComparatorWeak]]
+; CHECK-NOT: %[[#Result]]
+
+ %frombool3 = zext i1 %call2 to i8
+ store i8 %frombool3, ptr %weak_res, align 1
+ %7 = load i8, ptr %weak_res, align 1
+ %tobool4 = trunc i8 %7 to i1
+ %lnot5 = xor i1 %tobool4, true
+ %frombool6 = zext i1 %lnot5 to i8
+ store i8 %frombool6, ptr %res, align 1
+ ret void
+}
+
+declare spir_func zeroext i1 @_Z30atomic_compare_exchange_strongPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4), ptr addrspace(4), i32) #1
+declare spir_func zeroext i1 @_Z28atomic_compare_exchange_weakPVU3AS4U7_AtomiciPU3AS4ii(ptr addrspace(4), ptr addrspace(4), i32) #1
diff --git a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll
index c6ee804..07fbed9 100644
--- a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll
@@ -90,7 +90,7 @@ define i32 @test_tbegin_nofloat4(i32 %pad, ptr %ptr) {
; CHECK: tbegin 0, 65292
; CHECK: ipm %r2
; CHECK: srl %r2, 28
-; CHECK: ciblh %r2, 2, 0(%r14)
+; CHECK: bnhr %r14
; CHECK: mvhi 0(%r3), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tbegin.nofloat(ptr null, i32 65292)
@@ -219,7 +219,7 @@ define i32 @test_tend2(i32 %pad, ptr %ptr) {
; CHECK: tend
; CHECK: ipm %r2
; CHECK: srl %r2, 28
-; CHECK: ciblh %r2, 2, 0(%r14)
+; CHECK: bnhr %r14
; CHECK: mvhi 0(%r3), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tend()
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll
new file mode 100644
index 0000000..6b8746e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll
@@ -0,0 +1,738 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | FileCheck %s
+; Test implementation of combining br_ccmask for flag output operand, and
+; optimizing ipm sequence using conditional branches.
+
+declare void @dummy()
+
+; Check a case where the cc is used as an integer.
+; Just (srl (ipm)) sequence without optimization.
+define i32 @test(ptr %a) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ipm %r2
+; CHECK-NEXT: srl %r2, 28
+; CHECK-NEXT: br %r14
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ ret i32 %cc
+}
+
+; Test-1(f1_0_*). Test all 14 valid combinations, where cc is being used for
+; branching.
+
+; Check (cc == 0).
+define void @f1_0_eq_0(ptr %a) {
+; CHECK-LABEL: f1_0_eq_0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jge dummy@PLT
+; CHECK-NEXT: .LBB1_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp eq i32 %cc, 0
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc != 0).
+define void @f1_0_ne_0(ptr %a) {
+; CHECK-LABEL: f1_0_ne_0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgne dummy@PLT
+; CHECK-NEXT: .LBB2_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ugt i32 %cc, 0
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 1).
+define void @f1_0_eq_1(ptr %a) {
+; CHECK-LABEL: f1_0_eq_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgl dummy@PLT
+; CHECK-NEXT: .LBB3_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp eq i32 %cc, 1
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc != 1).
+define void @f1_0_ne_1(ptr %a) {
+; CHECK-LABEL: f1_0_ne_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnl dummy@PLT
+; CHECK-NEXT: .LBB4_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ne i32 %cc, 1
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 2).
+define void @f1_0_eq_2(ptr %a) {
+; CHECK-LABEL: f1_0_eq_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgh dummy@PLT
+; CHECK-NEXT: .LBB5_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp eq i32 %cc, 2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc != 2).
+define void @f1_0_ne_2(ptr %a) {
+; CHECK-LABEL: f1_0_ne_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnh dummy@PLT
+; CHECK-NEXT: .LBB6_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ne i32 %cc, 2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 3).
+define void @f1_0_eq_3(ptr %a) {
+; CHECK-LABEL: f1_0_eq_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgo dummy@PLT
+; CHECK-NEXT: .LBB7_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp eq i32 %cc, 3
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc != 3).
+define void @f1_0_ne_3(ptr %a) {
+; CHECK-LABEL: f1_0_ne_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgno dummy@PLT
+; CHECK-NEXT: .LBB8_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ult i32 %cc, 3
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 0|1).
+define void @f1_0_01(ptr %a) {
+; CHECK-LABEL: f1_0_01:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgle dummy@PLT
+; CHECK-NEXT: .LBB9_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ult i32 %cc, 2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 0|2).
+define void @f1_0_02(ptr %a) {
+; CHECK-LABEL: f1_0_02:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jghe dummy@PLT
+; CHECK-NEXT: .LBB10_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 0|3).
+define void @f1_0_03(ptr %a) {
+; CHECK-LABEL: f1_0_03:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnlh dummy@PLT
+; CHECK-NEXT: .LBB11_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp0 = icmp ne i32 %cc, 0
+ %cmp3 = icmp ne i32 %cc, 3
+ %cmp.inv = and i1 %cmp0, %cmp3
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 1|2).
+define void @f1_0_12(ptr %a) {
+; CHECK-LABEL: f1_0_12:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jglh dummy@PLT
+; CHECK-NEXT: .LBB12_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq1 = icmp eq i32 %cc, 1
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %cmp = or i1 %cmpeq1, %cmpeq2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 1|3).
+define void @f1_0_13(ptr %a) {
+; CHECK-LABEL: f1_0_13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnhe dummy@PLT
+; CHECK-NEXT: .LBB13_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq1 = icmp eq i32 %cc, 1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cmp = or i1 %cmpeq1, %cmpeq3
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check (cc == 2|3).
+define void @f1_0_23(ptr %a) {
+; CHECK-LABEL: f1_0_23:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnle dummy@PLT
+; CHECK-NEXT: .LBB14_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmp = icmp ugt i32 %cc, 1
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Test-2(f1_1_*/f1_2_*/f1_3_*/f1_4_*).
+; Test Mixed patterns involving Binary Ops.
+
+; Check 'add' for (cc != 0).
+define void @f1_1_1(ptr %a) {
+; CHECK-LABEL: f1_1_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgne dummy@PLT
+; CHECK-NEXT: .LBB15_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cmp = icmp ult i32 %add, 3
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'add' for (cc == 1|2).
+define void @f1_1_2(ptr %a) {
+; CHECK-LABEL: f1_1_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jglh dummy@PLT
+; CHECK-NEXT: .LBB16_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cmp = icmp ult i32 %add, 2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'add' for (cc == 1|2).
+define void @f1_1_3(ptr %a) {
+; CHECK-LABEL: f1_1_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jglh dummy@PLT
+; CHECK-NEXT: .LBB17_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -3
+ %cmp.inv = icmp ult i32 %add, -2
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'and' with one operand cc and other select_ccmask(cc !=1).
+define void @f1_2_1(ptr %a) {
+; CHECK-LABEL: f1_2_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnl dummy@PLT
+; CHECK-NEXT: .LBB18_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpne0 = icmp ne i32 %andcc, 0
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cmp.inv = and i1 %cmpne3, %cmpne0
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'and' with both operands select_ccmask(cc != 2).
+define void @f1_2_2(ptr %a) {
+; CHECK-LABEL: f1_2_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnh dummy@PLT
+; CHECK-NEXT: .LBB19_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %ugt1 = icmp samesign ugt i32 %cc, 1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %and.cond.inv = and i1 %ugt1, %cmpne3
+ br i1 %and.cond.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'and/tm' for (cc == 0|2).
+define void @f1_2_3(ptr %a) {
+; CHECK-LABEL: f1_2_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jghe dummy@PLT
+; CHECK-NEXT: .LBB20_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'and/tm' for (cc == 1|3).
+define void @f1_2_4(ptr %a) {
+; CHECK-LABEL: f1_2_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnhe dummy@PLT
+; CHECK-NEXT: .LBB21_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1).
+define void @f1_2_5(ptr %a) {
+; CHECK-LABEL: f1_2_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnl dummy@PLT
+; CHECK-NEXT: .LBB22_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cmp = xor i1 %cmpne3, %trunc
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check nested 'xor' cc with select_ccmask(cc != 1).
+define void @f1_3_1(ptr %a) {
+; CHECK-LABEL: f1_3_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnl dummy@PLT
+; CHECK-NEXT: .LBB23_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq0 = icmp eq i32 %cc, 0
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %xor = xor i1 %cmpeq0, %cmpeq2
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cmp.inv = xor i1 %cmpne3, %xor
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=1).
+define void @f1_3_2(ptr %a) {
+; CHECK-LABEL: f1_3_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnl dummy@PLT
+; CHECK-NEXT: .LBB24_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cmp.inv = xor i1 %cmpeq3, %trunc
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=2).
+define void @f1_3_3(ptr %a) {
+; CHECK-LABEL: f1_3_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgnh dummy@PLT
+; CHECK-NEXT: .LBB25_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne0 = icmp ne i32 %cc, 0
+ %cmp.cond.inv = xor i1 %cmpne0, %trunc
+ br i1 %cmp.cond.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'or' with both operands select_ccmask, one with TM and the other with
+; ICMP(cc == 1).
+define void @f1_4_1(ptr %a) {
+; CHECK-LABEL: f1_4_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgl dummy@PLT
+; CHECK-NEXT: .LBB26_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cmp.cond.inv = or i1 %cmpeq3, %cmpeq0
+ br i1 %cmp.cond.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'or' for (cc == 0|1).
+define void @f1_4_2(ptr %a) {
+; CHECK-LABEL: f1_4_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgle dummy@PLT
+; CHECK-NEXT: .LBB27_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cmp.inv = icmp samesign ugt i32 %or, -3
+ br i1 %cmp.inv, label %exit, label %branch
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
+; Check 'or' for (cc == 0|1).
+define void @f1_4_3(ptr %a) {
+; CHECK-LABEL: f1_4_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jgle dummy@PLT
+; CHECK-NEXT: .LBB28_1: # %exit
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cmp = icmp samesign ult i32 %or, -2
+ br i1 %cmp, label %branch, label %exit
+branch:
+ tail call void @dummy()
+ br label %exit
+exit:
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll
new file mode 100644
index 0000000..b9b9a4b
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll
@@ -0,0 +1,1665 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | FileCheck %s
+; Test implementation of combining select_ccmask for flag output operand and
+; optimizing ipm sequence using conditional branches.
+
+; Test-1(f2_0_*): Both TrueVal and FalseVal non-const(14-valid CCMask).
+
+; Check (cc == 0).
+define i64 @f2_0_eq_0(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_eq_0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB0_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp eq i32 %cc, 0
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc != 0).
+define i64 @f2_0_ne_0(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_ne_0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB1_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ugt i32 %cc, 0
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 1).
+define i64 @f2_0_eq_1(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_eq_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB2_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp eq i32 %cc, 1
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc != 1).
+define i64 @f2_0_ne_1(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_ne_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB3_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ne i32 %cc, 1
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 2).
+define i64 @f2_0_eq_2(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_eq_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB4_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp eq i32 %cc, 2
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc != 2).
+define i64 @f2_0_ne_2(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_ne_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnhr %r14
+; CHECK-NEXT: .LBB5_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ne i32 %cc, 2
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 3).
+define i64 @f2_0_eq_3(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_eq_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bor %r14
+; CHECK-NEXT: .LBB6_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp eq i32 %cc, 3
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc != 3).
+define i64 @f2_0_ne_3(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_ne_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnor %r14
+; CHECK-NEXT: .LBB7_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ult i32 %cc, 3
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 0|1).
+define i64 @f2_0_01(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_01:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB8_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ult i32 %cc, 2
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 0|2).
+define i64 @f2_0_02(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_02:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB9_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %and = and i32 %cc, 1
+ %cond = icmp eq i32 %and, 0
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check (cc == 0|3).
+define i64 @f2_0_03(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_0_03:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB10_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cmp0 = icmp ne i32 %cc, 0
+ %cmp3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %cmp0, %cmp3
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check (cc == 1|2).
+define i64 @f2_0_12(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_0_12:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlhr %r14
+; CHECK-NEXT: .LBB11_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %add = add nsw i32 %cc, -3
+ %cond.inv = icmp ult i32 %add, -2
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check (cc == 1|3).
+define i64 @f2_0_13(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_0_13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB12_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %and = and i32 %cc, 1
+ %cond.inv = icmp eq i32 %and, 0
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check (cc == 2|3).
+define i64 @f2_0_23(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_0_23:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnler %r14
+; CHECK-NEXT: .LBB13_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %cond = icmp ugt i32 %cc, 1
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Test-2(f2_1_*/f2_2_*/f2_3_*/f2_4_*).
+; Both TrueVal and FalseVal are non-const with mixed patterns involving
+; Binary Ops.
+
+; Check 'add' for (cc != 0).
+define i64 @f2_1_1(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_1_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB14_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 3
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f2_1_2(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_1_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB15_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 2
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f2_1_3(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_1_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlhr %r14
+; CHECK-NEXT: .LBB16_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -3
+ %cond.inv = icmp ult i32 %add, -2
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'and' with one operand cc and other select_ccmask(cc !=1).
+define i64 @f2_2_1(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_2_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB17_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpne0 = icmp ne i32 %andcc, 0
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %cmpne3, %cmpne0
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f2_2_2(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_2_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB18_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %ugt1 = icmp samesign ugt i32 %cc, 1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %ugt1, %cmpne3
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 0|2).
+define i64 @f2_2_3(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_2_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB19_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond = icmp eq i32 %and, 0
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 1|3).
+define i64 @f2_2_4(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_2_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB20_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond.inv = icmp eq i32 %and, 0
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1).
+define i64 @f2_2_5(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_2_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB21_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond = xor i1 %cmpne3, %trunc
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+
+; Check nested 'xor' cc with select_ccmask(cc != 1).
+define i64 @f2_3_1(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_3_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB22_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq0 = icmp eq i32 %cc, 0
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %xor = xor i1 %cmpeq0, %cmpeq2
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = xor i1 %cmpne3, %xor
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=1).
+define i64 @f2_3_2(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_3_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB23_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = xor i1 %cmpeq3, %trunc
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=2).
+define i64 @f2_3_3(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_3_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB24_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne0 = icmp ne i32 %cc, 0
+ %cond.inv = xor i1 %cmpne0, %trunc
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1).
+define i64 @f2_4_1(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_4_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB25_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = or i1 %cmpeq3, %cmpeq0
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f2_4_2(i64 %y, i64 %x, ptr %a) {
+; CHECK-LABEL: f2_4_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnler %r14
+; CHECK-NEXT: .LBB26_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond.inv = icmp samesign ugt i32 %or, -3
+ %res = select i1 %cond.inv, i64 %y, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f2_4_3(i64 %x, i64 %y, ptr %a) {
+; CHECK-LABEL: f2_4_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r4), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB27_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond = icmp samesign ult i32 %or, -2
+ %res = select i1 %cond, i64 %x, i64 %y
+ ret i64 %res
+}
+
+; Test-3(f3_1_*/f3_2_*/f3_3_*/f3_4_*).
+; TrueVal is non-const and FalseVal is const with mixed patterns involving
+; Binary Ops.
+
+; Check 'add' for (cc != 0).
+define i64 @f3_1_1(i64 %x, ptr %a) {
+; CHECK-LABEL: f3_1_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB28_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 3
+ %res = select i1 %cond, i64 %x, i64 5
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f3_1_2(i64 %x, ptr %a) {
+; CHECK-LABEL: f3_1_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB29_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 2
+ %res = select i1 %cond, i64 %x, i64 5
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f3_1_3(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_1_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnlhr %r14
+; CHECK-NEXT: .LBB30_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -3
+ %cond.inv = icmp ult i32 %add, -2
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'and' with one operand cc and other select_ccmask(cc !=1).
+define i64 @f3_2_1(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_2_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB31_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpne0 = icmp ne i32 %andcc, 0
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %cmpne3, %cmpne0
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f3_2_2(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_2_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB32_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %ugt1 = icmp samesign ugt i32 %cc, 1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %ugt1, %cmpne3
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 0|2).
+define i64 @f3_2_3(i64 %x, ptr %a) {
+; CHECK-LABEL: f3_2_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB33_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond = icmp eq i32 %and, 0
+ %res = select i1 %cond, i64 %x, i64 5
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 1|3).
+define i64 @f3_2_4(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_2_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB34_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond.inv = icmp eq i32 %and, 0
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1).
+define i64 @f3_2_5(i64 %x, ptr %a) {
+; CHECK-LABEL: f3_2_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB35_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond = xor i1 %cmpne3, %trunc
+ %res = select i1 %cond, i64 %x, i64 5
+ ret i64 %res
+}
+
+
+; Check nested 'xor' cc with select_ccmask(cc != 1).
+define i64 @f3_3_1(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_3_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB36_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq0 = icmp eq i32 %cc, 0
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %xor = xor i1 %cmpeq0, %cmpeq2
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = xor i1 %cmpne3, %xor
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=1).
+define i64 @f3_3_2(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_3_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB37_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = xor i1 %cmpeq3, %trunc
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=2).
+define i64 @f3_3_3(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_3_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB38_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne0 = icmp ne i32 %cc, 0
+ %cond.inv = xor i1 %cmpne0, %trunc
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1).
+define i64 @f3_4_1(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_4_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB39_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = or i1 %cmpeq3, %cmpeq0
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f3_4_2(ptr %a, i64 %x) {
+; CHECK-LABEL: f3_4_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnler %r14
+; CHECK-NEXT: .LBB40_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond.inv = icmp samesign ugt i32 %or, -3
+ %res = select i1 %cond.inv, i64 5, i64 %x
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f3_4_3(i64 %x, ptr %a) {
+; CHECK-LABEL: f3_4_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB41_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond = icmp samesign ult i32 %or, -2
+ %res = select i1 %cond, i64 %x, i64 5
+ ret i64 %res
+}
+
+
+; Test-4(f4_1_*/f4_2_*/f4_3_*/f4_4_*).
+; TrueVal is const and FalseVal is non-const with mixed patterns involving
+; Binary Ops.
+
+; Check 'add' for (cc != 0).
+define i64 @f4_1_1(ptr %a, i64 %y) {
+; CHECK-LABEL: f4_1_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB42_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 3
+ %res = select i1 %cond, i64 15, i64 %y
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f4_1_2(ptr %a, i64 %y) {
+; CHECK-LABEL: f4_1_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB43_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 2
+ %res = select i1 %cond, i64 15, i64 %y
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f4_1_3(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_1_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlhr %r14
+; CHECK-NEXT: .LBB44_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -3
+ %cond.inv = icmp ult i32 %add, -2
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'and' with one operand cc and other select_ccmask(cc !=1).
+define i64 @f4_2_1(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_2_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB45_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpne0 = icmp ne i32 %andcc, 0
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %cmpne3, %cmpne0
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f4_2_2(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_2_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB46_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %ugt1 = icmp samesign ugt i32 %cc, 1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %ugt1, %cmpne3
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 0|2).
+define i64 @f4_2_3(ptr %a, i64 %y) {
+; CHECK-LABEL: f4_2_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB47_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond = icmp eq i32 %and, 0
+ %res = select i1 %cond, i64 15, i64 %y
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 1|3).
+define i64 @f4_2_4(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_2_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB48_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond.inv = icmp eq i32 %and, 0
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1).
+define i64 @f4_2_5(ptr %a, i64 %y) {
+; CHECK-LABEL: f4_2_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB49_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond = xor i1 %cmpne3, %trunc
+ %res = select i1 %cond, i64 15, i64 %y
+ ret i64 %res
+}
+
+
+; Check nested 'xor' cc with select_ccmask(cc != 1).
+define i64 @f4_3_1(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_3_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB50_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq0 = icmp eq i32 %cc, 0
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %xor = xor i1 %cmpeq0, %cmpeq2
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = xor i1 %cmpne3, %xor
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=1).
+define i64 @f4_3_2(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_3_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB51_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = xor i1 %cmpeq3, %trunc
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=2).
+define i64 @f4_3_3(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_3_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB52_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne0 = icmp ne i32 %cc, 0
+ %cond.inv = xor i1 %cmpne0, %trunc
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1).
+define i64 @f4_4_1(i64 %y,ptr %a) {
+; CHECK-LABEL: f4_4_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB53_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = or i1 %cmpeq3, %cmpeq0
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f4_4_2(i64 %y, ptr %a) {
+; CHECK-LABEL: f4_4_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r3), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bnler %r14
+; CHECK-NEXT: .LBB54_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond.inv = icmp samesign ugt i32 %or, -3
+ %res = select i1 %cond.inv, i64 %y, i64 15
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f4_4_3(ptr %a, i64 %y) {
+; CHECK-LABEL: f4_4_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB55_1: # %entry
+; CHECK-NEXT: lgr %r2, %r3
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond = icmp samesign ult i32 %or, -2
+ %res = select i1 %cond, i64 15, i64 %y
+ ret i64 %res
+}
+
+; Test-5(f5_1_*/f5_2_*/f5_3_*/f5_4_*).
+; Both TrueVal and FalseVal are const with mixed patterns involving
+; Binary Ops.
+
+
+; Check 'add' for (cc != 0).
+define i64 @f5_1_1(ptr %a) {
+; CHECK-LABEL: f5_1_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB56_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 3
+ %res = select i1 %cond, i64 15, i64 5
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f5_1_2(ptr %a) {
+; CHECK-LABEL: f5_1_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB57_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -1
+ %cond = icmp ult i32 %add, 2
+ %res = select i1 %cond, i64 15, i64 5
+ ret i64 %res
+}
+
+; Check 'add' for (cc == 1|2).
+define i64 @f5_1_3(ptr %a) {
+; CHECK-LABEL: f5_1_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnlhr %r14
+; CHECK-NEXT: .LBB58_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %add = add nsw i32 %cc, -3
+ %cond.inv = icmp ult i32 %add, -2
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'and' with one operand cc and other select_ccmask(cc !=1).
+define i64 @f5_2_1(ptr %a) {
+; CHECK-LABEL: f5_2_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB59_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpne0 = icmp ne i32 %andcc, 0
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %cmpne3, %cmpne0
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f5_2_2(ptr %a) {
+; CHECK-LABEL: f5_2_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB60_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %ugt1 = icmp samesign ugt i32 %cc, 1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = and i1 %ugt1, %cmpne3
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 0|2).
+define i64 @f5_2_3(ptr %a) {
+; CHECK-LABEL: f5_2_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB61_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond = icmp eq i32 %and, 0
+ %res = select i1 %cond, i64 15, i64 5
+ ret i64 %res
+}
+
+; Check 'and/tm' for (cc == 1|3).
+define i64 @f5_2_4(ptr %a) {
+; CHECK-LABEL: f5_2_4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB62_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %and = and i32 %cc, 1
+ %cond.inv = icmp eq i32 %and, 0
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1).
+define i64 @f5_2_5(ptr %a) {
+; CHECK-LABEL: f5_2_5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB63_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond = xor i1 %cmpne3, %trunc
+ %res = select i1 %cond, i64 15, i64 5
+ ret i64 %res
+}
+
+
+; Check nested 'xor' cc with select_ccmask(cc != 1).
+define i64 @f5_3_1(ptr %a) {
+; CHECK-LABEL: f5_3_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB64_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %cmpeq0 = icmp eq i32 %cc, 0
+ %cmpeq2 = icmp eq i32 %cc, 2
+ %xor = xor i1 %cmpeq0, %cmpeq2
+ %cmpne3 = icmp ne i32 %cc, 3
+ %cond.inv = xor i1 %cmpne3, %xor
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=1).
+define i64 @f5_3_2(ptr %a) {
+; CHECK-LABEL: f5_3_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB65_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = xor i1 %cmpeq3, %trunc
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check branching on 'tm' and 'xor' with one operand cc and the other
+; select_ccmask(cc !=2).
+define i64 @f5_3_3(ptr %a) {
+; CHECK-LABEL: f5_3_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB66_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %trunc = trunc i32 %cc to i1
+ %cmpne0 = icmp ne i32 %cc, 0
+ %cond.inv = xor i1 %cmpne0, %trunc
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1).
+define i64 @f5_4_1(ptr %a) {
+; CHECK-LABEL: f5_4_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnlr %r14
+; CHECK-NEXT: .LBB67_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %cond.inv = or i1 %cmpeq3, %cmpeq0
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f5_4_2(ptr %a) {
+; CHECK-LABEL: f5_4_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: bnler %r14
+; CHECK-NEXT: .LBB68_1: # %entry
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond.inv = icmp samesign ugt i32 %or, -3
+ %res = select i1 %cond.inv, i64 5, i64 15
+ ret i64 %res
+}
+
+; Check 'or' for (cc == 0|1).
+define i64 @f5_4_3(ptr %a) {
+; CHECK-LABEL: f5_4_3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB69_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %tmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %tmp)
+ %or = or disjoint i32 %cc, -4
+ %cond = icmp samesign ult i32 %or, -2
+ %res = select i1 %cond, i64 15, i64 5
+ ret i64 %res
+}
+
+; Nested select_ccmask with TrueVal and FalseVal swapped with each other.
+define i64 @f6_1(ptr %a) {
+; CHECK-LABEL: f6_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: #APP
+; CHECK-NEXT: alsi 0(%r2), -1
+; CHECK-EMPTY:
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: lghi %r2, 15
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB70_1: # %entry
+; CHECK-NEXT: lghi %r2, 5
+; CHECK-NEXT: br %r14
+entry:
+ %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a)
+ %cmp = icmp ult i32 %cc, 4
+ tail call void @llvm.assume(i1 %cmp)
+ %andcc = and i32 %cc, 1
+ %cmpeq0 = icmp eq i32 %andcc, 0
+ %cmpeq3 = icmp eq i32 %cc, 3
+ %select = select i1 %cmpeq3, i64 5, i64 15
+ %res = select i1 %cmpeq0, i64 %select, i64 5
+ ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
index c2b4494..11e7e5c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -1,16 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
-; CHECK-LABEL: mul_v16i8
-; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
-; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
-; CHECK: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> {{.*}}, ptr {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v16i8(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 15
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -16
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 16
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef)
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr [[TMP6]], i32 4, <16 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 15
@@ -45,17 +70,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: mul_v8i16
-; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
-; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-; CHECK: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> {{.*}}, ptr {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v8i16(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 7
@@ -90,16 +139,41 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: mul_v4i32
-; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
-; CHECK: vector.body:
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mul_v4i32(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
@@ -134,17 +208,47 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: split_vector
-; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @split_vector(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[EXTRACT_1_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[EXTRACT_1_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <2 x i32> [[EXTRACT_1_LOW]], [[EXTRACT_2_LOW]]
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> [[EXTRACT_1_HIGH]], [[EXTRACT_2_HIGH]]
+; CHECK-NEXT: [[COMBINE:%.*]] = shufflevector <2 x i32> [[MUL]], <2 x i32> [[SUB]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
@@ -186,14 +290,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}
; One of the loads now uses ult predicate.
-; CHECK-LABEL: mismatch_load_pred
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[WRONG]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
@@ -236,17 +374,48 @@ for.cond.cleanup: ; preds = %vector.body, %entry
}
; The store now uses ult predicate.
-; CHECK-LABEL: mismatch_store_pred
-; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[WRONG]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
@@ -294,14 +463,72 @@ for.cond.cleanup: ; preds = %vector.body, %entry
;
; Step value 16 doesn't match vector width 4
;
-; CHECK-LABEL: interleave4
-; CHECK: vector.body:
-; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
-;
define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @interleave4(
+; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: [[V0:%.*]] = add i32 [[N]], 15
+; CHECK-NEXT: [[V1:%.*]] = lshr i32 [[V0]], 4
+; CHECK-NEXT: [[V2:%.*]] = shl nuw i32 [[V1]], 4
+; CHECK-NEXT: [[V3:%.*]] = add i32 [[V2]], -16
+; CHECK-NEXT: [[V4:%.*]] = lshr i32 [[V3]], 4
+; CHECK-NEXT: [[V5:%.*]] = add nuw nsw i32 [[V4]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, ptr [[A]], i32 8
+; CHECK-NEXT: [[SCEVGEP30:%.*]] = getelementptr i32, ptr [[C]], i32 8
+; CHECK-NEXT: [[SCEVGEP37:%.*]] = getelementptr i32, ptr [[B]], i32 8
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[V5]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP37]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV31:%.*]] = phi ptr [ [[SCEVGEP32:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP30]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP25:%.*]], %[[VECTOR_BODY]] ], [ [[SCEVGEP]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[V14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[V6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[V15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT: [[V7:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[ACTIVE_LANE_MASK15:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V7]], i32 [[N]])
+; CHECK-NEXT: [[V8:%.*]] = add i32 [[V7]], 4
+; CHECK-NEXT: [[ACTIVE_LANE_MASK16:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V8]], i32 [[N]])
+; CHECK-NEXT: [[V9:%.*]] = add i32 [[V8]], 4
+; CHECK-NEXT: [[ACTIVE_LANE_MASK17:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V9]], i32 [[N]])
+; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -2
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP42]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -1
+; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef)
+; CHECK-NEXT: [[SCEVGEP41:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 1
+; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef)
+; CHECK-NEXT: [[SCEVGEP34:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -2
+; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP34]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -1
+; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP35]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV31]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef)
+; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 1
+; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP36]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef)
+; CHECK-NEXT: [[V10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[V11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_MASKED_LOAD18]]
+; CHECK-NEXT: [[V12:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD19]]
+; CHECK-NEXT: [[V13:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD24]], [[WIDE_MASKED_LOAD20]]
+; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -2
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr [[SCEVGEP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -1
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr [[SCEVGEP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]])
+; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 1
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr [[SCEVGEP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]])
+; CHECK-NEXT: [[SCEVGEP25]] = getelementptr i32, ptr [[LSR_IV]], i32 16
+; CHECK-NEXT: [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 16
+; CHECK-NEXT: [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 16
+; CHECK-NEXT: [[V14]] = add i32 [[V9]], 4
+; CHECK-NEXT: [[V15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[V6]], i32 1)
+; CHECK-NEXT: [[V16:%.*]] = icmp ne i32 [[V15]], 0
+; CHECK-NEXT: br i1 [[V16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp sgt i32 %N, 0
%v0 = add i32 %N, 15
@@ -370,12 +597,42 @@ for.cond.cleanup:
ret void
}
-; CHECK-LABEL: const_expected_in_set_loop
-; CHECK: call <4 x i1> @llvm.get.active.lane.mask
-; CHECK-NOT: vctp
-; CHECK: ret void
-;
define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @const_expected_in_set_loop(
+; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 42)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
+; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
+; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4
+; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -413,12 +670,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: tripcount_arg_not_invariant
-; CHECK: call <4 x i1> @llvm.get.active.lane.mask
-; CHECK-NOT: vctp
-; CHECK: ret void
-;
define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @tripcount_arg_not_invariant(
+; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[INDEX]])
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
+; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
+; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4
+; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -458,12 +745,42 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: addrec_base_not_zero
-; CHECK: call <4 x i1> @llvm.get.active.lane.mask
-; CHECK-NOT: vctp
-; CHECK: ret void
-;
define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @addrec_base_not_zero(
+; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias readonly captures(none) [[C:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[VECTOR_PH:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV17:%.*]] = phi ptr [ [[SCEVGEP18:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], %[[VECTOR_BODY]] ], [ [[C]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 1, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
+; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
+; CHECK-NEXT: [[SCEVGEP18]] = getelementptr i32, ptr [[LSR_IV17]], i32 4
+; CHECK-NEXT: [[TMP8]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1)
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_BODY]], label %[[VECTOR_PH]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index fa6a66b..9775cf9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -1,15 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name INST --version 6
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
-; CHECK-LABEL: reduction_i32
-; CHECK: phi i32 [ 0, %vector.ph ]
-; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ]
-; CHECK: phi i32
-; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
-; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp5, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, i32 %N) {
+; CHECK-LABEL: define i16 @reduction_i32(
+; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef)
+; CHECK-NEXT: [[INSTTMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP5]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[WIDE_MASKED_LOAD3]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP9]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT: br i1 [[TMP12]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <8 x i16> [ [[VEC_PHI]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi <8 x i1> [ [[TMP5]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <8 x i16> [ [[TMP8]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[DOTLCSSA3]], <8 x i16> [[DOTLCSSA]], <8 x i16> [[VEC_PHI_LCSSA]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[TMP10]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i16> [[RDX_SHUF4]], [[BIN_RDX]]
+; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i16> [[BIN_RDX5]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i16> [[RDX_SHUF6]], [[BIN_RDX5]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[BIN_RDX7]], i32 0
+; CHECK-NEXT: ret i16 [[TMP11]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: ret i16 [[RES_0]]
+;
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -59,16 +99,52 @@ for.cond.cleanup:
ret i16 %res.0
}
-; CHECK-LABEL: reduction_i32_with_scalar
-; CHECK: vector.body:
-; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK: %{{.*}} = phi i32 [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
-; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp2, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
+; CHECK-LABEL: define i16 @reduction_i32_with_scalar(
+; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef)
+; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: ret i16 [[TMP9]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RES_0:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: ret i16 [[RES_0]]
+;
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -119,15 +195,46 @@ for.cond.cleanup:
; despite this we can still calculate a precise enough range so that
; the overflow checks for get.active.lane.mask don't reject
; tail-predication.
-;
-; CHECK-LABEL: @reduction_not_guarded
-;
-; CHECK: vector.body:
-; CHECK: @llvm.arm.mve.vctp
-; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
-; CHECK: ret
-;
define i16 @reduction_not_guarded(ptr nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
+; CHECK-LABEL: define i16 @reduction_not_guarded(
+; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], i16 [[B:%.*]], i32 [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i32 [[TMP]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N_VEC]], -8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[INSTTMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef)
+; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[INSTTMP8:%.*]] = select <8 x i1> [[TMP5]], <8 x i16> [[INSTTMP6]], <8 x i16> [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[INSTTMP8]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[RDX_SHUF]], [[INSTTMP8]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i16> [[RDX_SHUF5]], [[BIN_RDX]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x i16> [[BIN_RDX6]], <8 x i16> undef, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i16> [[RDX_SHUF7]], [[BIN_RDX6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: ret i16 [[TMP9]]
+;
entry:
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
@@ -166,12 +273,76 @@ middle.block: ; preds = %vector.body
ret i16 %tmp9
}
-; CHECK-LABEL: @Correlation
-; CHECK: vector.body:
-; CHECK: @llvm.arm.mve.vctp
-; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask
-;
define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @Correlation(
+; CHECK-SAME: ptr readonly captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]], i16 signext [[SIZE:%.*]], i16 signext [[N:%.*]], i16 signext [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N]] to i32
+; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP36]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END17:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE]] to i32
+; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV51:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_END:.*]] ], [ [[TMP0]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT: [[LSR_IV46:%.*]] = phi ptr [ [[SCEVGEP47:%.*]], %[[FOR_END]] ], [ [[INPUT]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, %[[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], %[[FOR_END]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i32 [[I_037]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i32 [[TMP5]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -4
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i32 [[TMP8]], 1
+; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]]
+; CHECK-NEXT: br i1 [[CMP433]], label %[[VECTOR_PH:.*]], label %[[FOR_END]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP9]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV48:%.*]] = phi ptr [ [[SCEVGEP49:%.*]], %[[VECTOR_BODY]] ], [ [[LSR_IV46]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[INPUT]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP11]])
+; CHECK-NEXT: [[TMP13]] = sub i32 [[TMP11]], 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV48]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef)
+; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = ashr <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[TMP19]], [[VEC_PHI]]
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, ptr [[LSR_IV]], i32 4
+; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, ptr [[LSR_IV48]], i32 4
+; CHECK-NEXT: [[TMP21]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP10]], i32 1)
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; CHECK-NEXT: br i1 [[TMP22]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP20]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]])
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16
+; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP25]] to i16
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[OUTPUT]], i32 [[I_037]]
+; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2
+; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1
+; CHECK-NEXT: [[SCEVGEP47]] = getelementptr i16, ptr [[LSR_IV46]], i32 1
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV51]], -1
+; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]]
+; CHECK-NEXT: br i1 [[EXITCOND39]], label %[[FOR_END17]], label %[[FOR_BODY]]
+; CHECK: [[FOR_END17]]:
+; CHECK-NEXT: ret void
+;
entry:
%conv = sext i16 %N to i32
%cmp36 = icmp sgt i16 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index a8ad360..b54d526 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -1,8 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
-; CHECK-LABEL: expand_v8i16_v8i32
-; CHECK-NOT: call i32 @llvm.arm.mve.vctp
define void @expand_v8i16_v8i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define void @expand_v8i16_v8i32(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
+; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD2]] to <8 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i32> [[EXPAND_2]], [[EXPAND_1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 7
@@ -39,15 +74,57 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: expand_v8i16_v4i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
-; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
-; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
-; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred)
-; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %store.pred)
define void @expand_v8i16_v4i32(ptr readonly %a, ptr readonly %b, ptr %c, ptr %d, i32 %N) {
+; CHECK-LABEL: define void @expand_v8i16_v4i32(
+; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 7
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 3
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10_STORE]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STORE_IDX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[STORE_IDX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef)
+; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i16> [[EXTRACT_2_LOW]] to <4 x i32>
+; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i16> [[EXTRACT_2_HIGH]] to <4 x i32>
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[EXPAND_2]], [[EXPAND_1]]
+; CHECK-NEXT: [[SUB:%.*]] = mul nsw <4 x i32> [[EXPAND_1]], [[EXPAND_2]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT_STORE:%.*]] = insertelement <4 x i32> undef, i32 [[STORE_IDX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT_STORE:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT_STORE]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION_STORE:%.*]] = add <4 x i32> [[BROADCAST_SPLAT_STORE]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[STORE_PRED:%.*]] = icmp ule <4 x i32> [[INDUCTION_STORE]], [[BROADCAST_SPLAT11_STORE]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[STORE_IDX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[STORE_PRED]])
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[D]], i32 [[STORE_IDX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[SUB]], ptr [[GEP]], i32 4, <4 x i1> [[STORE_PRED]])
+; CHECK-NEXT: [[STORE_IDX_NEXT]] = add i32 [[STORE_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 7
@@ -98,9 +175,43 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: expand_v4i32_v4i64
-; CHECK-NOT: call i32 @llvm.arm.mve.vctp
define void @expand_v4i32_v4i64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
+; CHECK-LABEL: define void @expand_v4i32_v4i64(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], ptr noalias captures(none) [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i32 [[TMP12]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP13]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD2]] to <4 x i64>
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i64> [[EXPAND_2]], [[EXPAND_1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[C]], i32 [[INDEX]]
+; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: ret void
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index ec542df..fb1a4a4 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -1,24 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
-; CHECK-LABEL: vec_mul_reduce_add
-
-; CHECK: vector.ph:
-; CHECK: %start = call i32 @llvm.start.loop.iterations.i32
-; CHECK: br label %vector.body
-
-; CHECK: vector.body:
-; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
-; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]]
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]],
-
-; CHECK: middle.block:
-; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
-; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
-
define i32 @vec_mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
+; CHECK-LABEL: define i32 @vec_mul_reduce_add(
+; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -4
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 1
+; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_COND_CLEANUP:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP5]])
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[VECTOR_BODY]] ], [ [[A]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[B]], %[[VECTOR_PH]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP7]])
+; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP7]], 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV2]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP11]] = add nsw <4 x i32> [[TMP10]], [[VEC_PHI]]
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
+; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i32, ptr [[LSR_IV2]], i32 4
+; CHECK-NEXT: [[TMP12]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP6]], i32 1)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label %[[VECTOR_BODY]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+;
entry:
%cmp8 = icmp eq i32 %N, 0
%0 = add i32 %N, 3
diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll
index 1e2b332..47c7918 100644
--- a/llvm/test/CodeGen/Thumb2/carry.ll
+++ b/llvm/test/CodeGen/Thumb2/carry.ll
@@ -1,35 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: lsls r1, r1, #1
-; CHECK: orr.w r1, r1, r0, lsr #31
-; CHECK: rsbs r0, r2, r0, lsl #1
-; CHECK: sbcs r1, r3
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: orr.w r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; rdar://12559385
define i64 @f3(i32 %vi) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbcs r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: sbcs r2, r1
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
index ae170d7..d949068 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
@@ -104,6 +104,31 @@ define void @memset_i32(ptr %dest, i8 %val, i32 %len) {
ret void
}
+; CHECK-LABEL: memcpy_0:
+; CHECK-NEXT: .functype memcpy_0 (i32, i32) -> ()
+; CHECK-NEXT: return
+define void @memcpy_0(ptr %dest, ptr %src) {
+ call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memmove_0:
+; CHECK-NEXT: .functype memmove_0 (i32, i32) -> ()
+; CHECK-NEXT: return
+define void @memmove_0(ptr %dest, ptr %src) {
+ call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memset_0:
+; NO-BULK-MEM-NOT: memory.fill
+; BULK-MEM-NEXT: .functype memset_0 (i32, i32) -> ()
+; BULK-MEM-NEXT: return
+define void @memset_0(ptr %dest, i8 %val) {
+ call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 0, i1 0)
+ ret void
+}
+
; CHECK-LABEL: memcpy_1:
; CHECK-NEXT: .functype memcpy_1 (i32, i32) -> ()
; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1)
@@ -137,14 +162,8 @@ define void @memset_1(ptr %dest, i8 %val) {
; CHECK-LABEL: memcpy_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memcpy_1024(ptr %dest, ptr %src) {
call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0)
@@ -154,14 +173,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memmove_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memmove_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memmove_1024(ptr %dest, ptr %src) {
call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0)
@@ -171,14 +184,8 @@ define void @memmove_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memset_1024:
; NO-BULK-MEM-NOT: memory.fill
; BULK-MEM-NEXT: .functype memset_1024 (i32, i32) -> ()
-; BULK-MEM-NEXT: block
; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L2]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]]
; BULK-MEM-NEXT: return
define void @memset_1024(ptr %dest, i8 %val) {
call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 1024, i1 0)
@@ -201,17 +208,11 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_src(ptr %dst) {
%a = alloca [100 x i8]
@@ -224,17 +225,11 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_dst(ptr %src) {
%a = alloca [100 x i8]
@@ -247,17 +242,11 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i32.sub $1=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i32.eqz $push[[L3:[0-9]+]]=, $pop[[L2]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L3]]
-; BULK-MEM-NEXT: i32.const $push[[L4:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i32.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
-; BULK-MEM-NEXT: i32.const $push[[L6:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memset_alloca(i8 %val) {
%a = alloca [100 x i8]
diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
index 0cf8493..d0206a3 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
@@ -110,6 +110,31 @@ define void @memset_i32(ptr %dest, i8 %val, i64 %len) {
ret void
}
+; CHECK-LABEL: memcpy_0:
+; CHECK-NEXT: .functype memcpy_0 (i64, i64) -> ()
+; CHECK-NEXT: return
+define void @memcpy_0(ptr %dest, ptr %src) {
+ call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memmove_0:
+; CHECK-NEXT: .functype memmove_0 (i64, i64) -> ()
+; CHECK-NEXT: return
+define void @memmove_0(ptr %dest, ptr %src) {
+ call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0)
+ ret void
+}
+
+; CHECK-LABEL: memset_0:
+; NO-BULK-MEM-NOT: memory.fill
+; BULK-MEM-NEXT: .functype memset_0 (i64, i32) -> ()
+; BULK-MEM-NEXT: return
+define void @memset_0(ptr %dest, i8 %val) {
+ call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 0, i1 0)
+ ret void
+}
+
; CHECK-LABEL: memcpy_1:
; CHECK-NEXT: .functype memcpy_1 (i64, i64) -> ()
; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1)
@@ -143,14 +168,8 @@ define void @memset_1(ptr %dest, i8 %val) {
; CHECK-LABEL: memcpy_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_1024 (i64, i64) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memcpy_1024(ptr %dest, ptr %src) {
call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0)
@@ -160,14 +179,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memmove_1024:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memmove_1024 (i64, i64) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memmove_1024(ptr %dest, ptr %src) {
call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0)
@@ -177,14 +190,8 @@ define void @memmove_1024(ptr %dest, ptr %src) {
; CHECK-LABEL: memset_1024:
; NO-BULK-MEM-NOT: memory.fill
; BULK-MEM-NEXT: .functype memset_1024 (i64, i32) -> ()
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024
-; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]]
-; BULK-MEM-NEXT: br_if 0, $pop0
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024
; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
; BULK-MEM-NEXT: return
define void @memset_1024(ptr %dest, i8 %val) {
call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 1024, i1 0)
@@ -207,17 +214,11 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i64) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_src(ptr %dst) {
%a = alloca [100 x i8]
@@ -230,17 +231,11 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i64) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]]
-; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, $pop[[L7]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_dst(ptr %src) {
%a = alloca [100 x i8]
@@ -253,17 +248,11 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112
-; BULK-MEM-NEXT: i64.sub $1=, $pop[[L1]], $pop[[L0]]
-; BULK-MEM-NEXT: block
-; BULK-MEM-NEXT: i64.const $push[[L2:[0-9]+]]=, 100
-; BULK-MEM-NEXT: i64.eqz $push[[L3:[0-9]+]]=, $pop[[L2]]
-; BULK-MEM-NEXT: br_if 0, $pop[[L3]]
-; BULK-MEM-NEXT: i64.const $push[[L4:[0-9]+]]=, 12
-; BULK-MEM-NEXT: i64.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
-; BULK-MEM-NEXT: i64.const $push[[L6:[0-9]+]]=, 100
-; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]]
-; BULK-MEM-NEXT: .LBB{{.*}}:
-; BULK-MEM-NEXT: end_block
+; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]]
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
+; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
+; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memset_alloca(i8 %val) {
%a = alloca [100 x i8]
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 04a2268..314e1b4 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,5 +1,6 @@
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
target triple = "wasm32"
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: i32_mac_u8_s8:
+; CHECK: loop
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+entry:
+ %cmp7.not = icmp eq i32 %N, 0
+ br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %res.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %mul = mul nsw i32 %conv2, %conv
+ %add = add nsw i32 %mul, %res.08
+ %inc = add nuw i32 %i.09, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
; CHECK-LABEL: i32_mac_s16:
; CHECK: i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: v128.load
; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.add
entry:
%cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
entry:
%cmp7.not = icmp eq i32 %N, 0
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
+
entry:
%cmp8.not = icmp eq i32 %N, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
; MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
entry:
%cmp6.not = icmp eq i32 %N, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
diff --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll
new file mode 100644
index 0000000..abbd953
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=mvp -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+
+; This test ensures that loads and stores generated for small memcpy et al use
+; constant offset folding.
+
+
+target triple = "wasm32-unknown-unknown"
+
+define void @call_memset(ptr) #0 {
+; CHECK-LABEL: call_memset:
+; CHECK: .functype call_memset (i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.const $push0=, 0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.const $push1=, 0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memset.p0.i32(ptr align 1 %0, i8 0, i32 16, i1 false)
+ ret void
+}
+
+define void @call_memcpy(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: call_memcpy:
+; CHECK: .functype call_memcpy (i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.load $push1=, 0($1):p2align=0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false)
+ ret void
+}
+
+
+define void @call_memmove(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: call_memmove:
+; CHECK: .functype call_memmove (i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $2=, 0($1):p2align=0
+; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $2
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memmove.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
new file mode 100644
index 0000000..3654aae
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_1:
+; CHECK: .functype dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+
+define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_2:
+; CHECK: .functype dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle2, %shuffle1
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @dot_sext_self(<8 x i16> %v) {
+; CHECK-LABEL: dot_sext_self:
+; CHECK: .functype dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %mul = mul <8 x i32> %sext, %sext
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_zext:
+; CHECK: .functype dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_u
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_u
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %zext1 = zext <8 x i16> %a to <8 x i32>
+ %zext2 = zext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %zext1, %zext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_wrong_shuffle:
+; CHECK: .functype dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
new file mode 100644
index 0000000..9716cbe
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-dot.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s
+
+target triple = "wasm32"
+; relaxed_dot stands for relaxed_dot_i8x16_i7x16_s, as in td
+; relaxed_dot_add stands for i32x4.relaxed_dot_i8x16_i7x16_add_s, as in td
+
+define <8 x i16> @relaxed_dot_sext_1(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_1:
+; CHECK: .functype relaxed_dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+
+define <8 x i16> @relaxed_dot_sext_2(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_sext_2:
+; CHECK: .functype relaxed_dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $1
+; CHECK-NEXT: return $pop0
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle2, %shuffle1
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @relaxed_dot_sext_self(<16 x i8> %v) {
+; CHECK-LABEL: relaxed_dot_sext_self:
+; CHECK: .functype relaxed_dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.relaxed_dot_i8x16_i7x16_s $push0=, $0, $0
+; CHECK-NEXT: return $pop0
+ %sext = sext <16 x i8> %v to <16 x i16>
+ %mul = mul <16 x i16> %sext, %sext
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @relaxed_dot_add_from_relaxed_dot(<16 x i8> %a, <16 x i8> %b, <4 x i32> %c) {
+; CHECK-LABEL: relaxed_dot_add_from_relaxed_dot:
+; CHECK: .functype relaxed_dot_add_from_relaxed_dot (v128, v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.relaxed_dot_i8x16_i7x16_add_s $push0=, $0, $1, $2
+; CHECK-NEXT: return $pop0
+ %relaxed_dot_call = call <8 x i16> @llvm.wasm.relaxed.dot.i8x16.i7x16.signed(<16 x i8> %a, <16 x i8> %b)
+ %sext = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %relaxed_dot_call)
+ %res = add <4 x i32> %sext, %c
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <8 x i16> @relaxed_dot_zext(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_zext:
+; CHECK: .functype relaxed_dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push6=, $0, $1
+; CHECK-NEXT: local.tee $push5=, $2=, $pop6
+; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; CHECK-NEXT: local.tee $push3=, $1=, $pop4
+; CHECK-NEXT: i8x16.shuffle $push1=, $pop5, $pop3, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK-NEXT: i8x16.shuffle $push0=, $2, $1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %zext1 = zext <16 x i8> %a to <16 x i16>
+ %zext2 = zext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %zext1, %zext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+
+}
+
+; INFO: Negative test
+define <8 x i16> @relaxed_dot_wrong_shuffle(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: relaxed_dot_wrong_shuffle:
+; CHECK: .functype relaxed_dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push1=, $0, $1
+; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push0=, $0, $1
+; CHECK-NEXT: i16x8.add $push2=, $pop1, $pop0
+; CHECK-NEXT: return $pop2
+ %sext1 = sext <16 x i8> %a to <16 x i16>
+ %sext2 = sext <16 x i8> %b to <16 x i16>
+ %mul = mul <16 x i16> %sext1, %sext2
+ %shuffle1 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <16 x i16> %mul, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = add <8 x i16> %shuffle1, %shuffle2
+ ret <8 x i16> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de3..600241a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -2,9 +2,278 @@
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128, | FileCheck %s --check-prefix=STRICT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=NOFP16
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NOSIMD
target triple = "wasm32"
+define half @fadd_fmul_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f16:
+; RELAXED: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fadd_fmul_contract_f16:
+; STRICT: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $0
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $1
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fadd_fmul_contract_f16:
+; NOFP16: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f16:
+; NOSIMD: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %mul = fmul contract half %b, %a
+ %add = fadd contract half %mul, %c
+ ret half %add
+}
+
+define half @fmuladd_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_contract_f16:
+; RELAXED: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_contract_f16:
+; STRICT: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_contract_f16:
+; NOFP16: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_contract_f16:
+; NOSIMD: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+define half @fmuladd_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_f16:
+; RELAXED: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_f16:
+; STRICT: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_f16:
+; NOFP16: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_f16:
+; NOSIMD: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $1, $0
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $1, $0
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f32:
+; NOFP16: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $1, $0
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f32:
+; NOSIMD: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $1, $0
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %mul = fmul contract float %b, %a
+ %add = fadd contract float %mul, %c
+ ret float %add
+}
+
+define float @fmuladd_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_contract_f32:
+; RELAXED: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f32:
+; STRICT: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f32:
+; NOFP16: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f32:
+; NOSIMD: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
+define float @fmuladd_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_f32:
+; RELAXED: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f32:
+; STRICT: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f32:
+; NOFP16: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f32:
+; NOSIMD: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; RELAXED-LABEL: fadd_fmul_contract_f64:
; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
@@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; STRICT-NEXT: f64.mul $push0=, $1, $0
; STRICT-NEXT: f64.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f64:
+; NOFP16: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $1, $0
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f64:
+; NOSIMD: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $1, $0
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
%mul = fmul contract double %b, %a
%add = fadd contract double %mul, %c
ret double %add
}
+define double @fmuladd_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_f64:
+; RELAXED: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f64:
+; STRICT: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f64:
+; NOFP16: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f64:
+; NOSIMD: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
+define double @fmuladd_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_contract_f64:
+; RELAXED: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f64:
+; STRICT: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f64:
+; NOFP16: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f64:
+; NOSIMD: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_contract_4xf32:
; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_4xf32:
@@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_4xf32:
+; NOFP16: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_4xf32:
+; NOSIMD: .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul contract <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
-
define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; RELAXED-LABEL: fadd_fmul_contract_8xf16:
; RELAXED: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_8xf16:
; STRICT: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.madd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf16:
+; NOFP16: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf16:
+; NOSIMD: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x half> %b, %a
%add = fadd contract <8 x half> %mul, %c
ret <8 x half> %add
}
-
define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_4xf32:
; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
@@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_4xf32:
+; NOFP16: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_4xf32:
+; NOSIMD: .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_contract_8xf16:
+; NOFP16: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_contract_8xf16:
+; NOSIMD: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_8xf16:
+; NOFP16: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf16:
+; NOSIMD: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_contract_4xf32:
; RELAXED: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_contract_4xf32:
@@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_4xf32:
+; NOFP16: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_4xf32:
+; NOSIMD: .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-; TODO: This should also have relaxed_madd in RELAXED case
define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_4xf32:
; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_4xf32:
; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_4xf32:
+; NOFP16: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_4xf32:
+; NOSIMD: .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
+define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fmuladd_8xf32:
+; RELAXED: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.mul $push0=, $2, $4
+; RELAXED-NEXT: f32x4.add $push1=, $pop0, $6
+; RELAXED-NEXT: v128.store 16($0), $pop1
+; RELAXED-NEXT: f32x4.mul $push2=, $1, $3
+; RELAXED-NEXT: f32x4.add $push3=, $pop2, $5
+; RELAXED-NEXT: v128.store 0($0), $pop3
+; RELAXED-NEXT: return
+;
+; STRICT-LABEL: fmuladd_8xf32:
+; STRICT: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $2, $4
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT: v128.store 16($0), $pop1
+; STRICT-NEXT: f32x4.mul $push2=, $1, $3
+; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT: v128.store 0($0), $pop3
+; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fmuladd_8xf32:
+; NOFP16: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $2, $4
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $1, $3
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf32:
+; NOSIMD: .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $16
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $15
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $14
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $13
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $4, $12
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $3, $11
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $2, $10
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $1, $9
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
+ %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+ ret <8 x float> %fma
+}
+
+define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_contract_2xf64:
+; RELAXED: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_2xf64:
+; STRICT: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_2xf64:
+; NOFP16: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_2xf64:
+; NOSIMD: .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_2xf64:
+; NOFP16: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_2xf64:
+; NOSIMD: .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fma_4xf32:
; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128)
@@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
; STRICT-NEXT: return $pop19
+;
+; NOFP16-LABEL: fma_4xf32:
+; NOFP16: .functype fma_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.extract_lane $push2=, $0, 0
+; NOFP16-NEXT: f32x4.extract_lane $push1=, $1, 0
+; NOFP16-NEXT: f32x4.extract_lane $push0=, $2, 0
+; NOFP16-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; NOFP16-NEXT: f32x4.splat $push4=, $pop3
+; NOFP16-NEXT: f32x4.extract_lane $push7=, $0, 1
+; NOFP16-NEXT: f32x4.extract_lane $push6=, $1, 1
+; NOFP16-NEXT: f32x4.extract_lane $push5=, $2, 1
+; NOFP16-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; NOFP16-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; NOFP16-NEXT: f32x4.extract_lane $push12=, $0, 2
+; NOFP16-NEXT: f32x4.extract_lane $push11=, $1, 2
+; NOFP16-NEXT: f32x4.extract_lane $push10=, $2, 2
+; NOFP16-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; NOFP16-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; NOFP16-NEXT: f32x4.extract_lane $push17=, $0, 3
+; NOFP16-NEXT: f32x4.extract_lane $push16=, $1, 3
+; NOFP16-NEXT: f32x4.extract_lane $push15=, $2, 3
+; NOFP16-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; NOFP16-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; NOFP16-NEXT: return $pop19
+;
+; NOSIMD-LABEL: fma_4xf32:
+; NOSIMD: .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $4, $8, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop0
+; NOSIMD-NEXT: call $push1=, fmaf, $3, $7, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop1
+; NOSIMD-NEXT: call $push2=, fmaf, $2, $6, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop2
+; NOSIMD-NEXT: call $push3=, fmaf, $1, $5, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop3
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
@@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fadd_fmul_contract_8xf32:
; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
; STRICT-NEXT: v128.store 0($0), $pop3
; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf32:
+; NOFP16: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $4, $2
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $3, $1
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf32:
+; NOSIMD: .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $16, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $15, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $14, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $13, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $12, $4
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $11, $3
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $10, $2
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $9, $1
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x float> %b, %a
%add = fadd contract <8 x float> %mul, %c
ret <8 x float> %add
}
-
define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; RELAXED-LABEL: fadd_fmul_contract_2xf64:
; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_2xf64:
@@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; STRICT-NEXT: f64x2.mul $push0=, $1, $0
; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_2xf64:
+; NOFP16: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_2xf64:
+; NOSIMD: .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
%mul = fmul contract <2 x double> %b, %a
%add = fadd contract <2 x double> %mul, %c
ret <2 x double> %add
}
-define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_contract_f32:
-; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_2xf64:
+; RELAXED: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32.mul $push0=, $1, $0
-; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: f64x2.mul $push0=, $1, $0
+; RELAXED-NEXT: f64x2.add $push1=, $pop0, $2
; RELAXED-NEXT: return $pop1
;
-; STRICT-LABEL: fadd_fmul_contract_f32:
-; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-LABEL: fadd_fmul_2xf64:
+; STRICT: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f32.mul $push0=, $1, $0
-; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: f64x2.mul $push0=, $1, $0
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
- %mul = fmul contract float %b, %a
- %add = fadd contract float %mul, %c
- ret float %add
+;
+; NOFP16-LABEL: fadd_fmul_2xf64:
+; NOFP16: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_2xf64:
+; NOSIMD: .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %mul = fmul <2 x double> %b, %a
+ %add = fadd <2 x double> %mul, %c
+ ret <2 x double> %add
}
define float @fma_f32(float %a, float %b, float %c) {
@@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f32:
+; NOFP16: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f32:
+; NOSIMD: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call float @llvm.fma(float %a, float %b, float %c)
ret float %fma
}
@@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fma, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f64:
+; NOFP16: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fma, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f64:
+; NOSIMD: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fma, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call double @llvm.fma(double %a, double %b, double %c)
ret double %fma
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
index 6e2d860..b90c1da 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; RELAXED-LABEL: fsub_fmul_contract_4xf32:
; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_4xf32:
@@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
; RELAXED-LABEL: fsub_fmul_contract_8xf16:
; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_8xf16:
; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.nmadd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
%mul = fmul contract <8 x half> %b, %a
%sub = fsub contract <8 x half> %c, %mul
ret <8 x half> %sub
@@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fsub_fmul_contract_8xf32:
; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; RELAXED-LABEL: fsub_fmul_contract_2xf64:
; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_2xf64:
@@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
ret float %sub
}
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+ %fneg = fneg <8 x half> %a
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $0, $1
+; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <4 x float> %a
+ %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <2 x double> %a
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
diff --git a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
index 28b4541..7bdc4e1 100644
--- a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
+++ b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
@@ -44,7 +44,7 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: callq __ubyte_convert_to_ctype
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: js LBB0_6
+; CHECK-NEXT: js LBB0_4
; CHECK-NEXT: ## %bb.1: ## %cond_next.i
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: movq %rbx, %rdi
@@ -53,84 +53,81 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: sarl $31, %ecx
; CHECK-NEXT: andl %eax, %ecx
; CHECK-NEXT: cmpl $-2, %ecx
-; CHECK-NEXT: je LBB0_10
+; CHECK-NEXT: je LBB0_8
; CHECK-NEXT: ## %bb.2: ## %cond_next.i
; CHECK-NEXT: cmpl $-1, %ecx
-; CHECK-NEXT: jne LBB0_3
-; CHECK-NEXT: LBB0_8: ## %bb4
+; CHECK-NEXT: jne LBB0_6
+; CHECK-NEXT: LBB0_3: ## %bb4
; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %rax
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: movq 16(%rax), %rax
-; CHECK-NEXT: jmp LBB0_9
-; CHECK-NEXT: LBB0_6: ## %_ubyte_convert2_to_ctypes.exit
+; CHECK-NEXT: jmp LBB0_10
+; CHECK-NEXT: LBB0_4: ## %_ubyte_convert2_to_ctypes.exit
; CHECK-NEXT: cmpl $-2, %eax
-; CHECK-NEXT: je LBB0_10
-; CHECK-NEXT: ## %bb.7: ## %_ubyte_convert2_to_ctypes.exit
-; CHECK-NEXT: cmpl $-1, %eax
; CHECK-NEXT: je LBB0_8
-; CHECK-NEXT: LBB0_3: ## %bb35
+; CHECK-NEXT: ## %bb.5: ## %_ubyte_convert2_to_ctypes.exit
+; CHECK-NEXT: cmpl $-1, %eax
+; CHECK-NEXT: je LBB0_3
+; CHECK-NEXT: LBB0_6: ## %bb35
; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %r14
; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: callq *216(%rax)
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: je LBB0_4
-; CHECK-NEXT: ## %bb.12: ## %cond_false.i
-; CHECK-NEXT: setne %dil
+; CHECK-NEXT: je LBB0_11
+; CHECK-NEXT: ## %bb.7: ## %cond_false.i
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-NEXT: movzbl %sil, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: divb %dl
; CHECK-NEXT: movl %eax, %r15d
; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: testb %dil, %al
-; CHECK-NEXT: jne LBB0_5
-; CHECK-NEXT: LBB0_13: ## %cond_true.i200
-; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: jne LBB0_15
-; CHECK-NEXT: ## %bb.14: ## %cond_true14.i
-; CHECK-NEXT: movl $4, %edi
-; CHECK-NEXT: callq _feraiseexcept
-; CHECK-NEXT: LBB0_15: ## %ubyte_ctype_remainder.exit
-; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: jmp LBB0_16
-; CHECK-NEXT: LBB0_10: ## %bb17
+; CHECK-NEXT: jne LBB0_12
+; CHECK-NEXT: jmp LBB0_14
+; CHECK-NEXT: LBB0_8: ## %bb17
; CHECK-NEXT: callq _PyErr_Occurred
; CHECK-NEXT: testq %rax, %rax
-; CHECK-NEXT: jne LBB0_23
-; CHECK-NEXT: ## %bb.11: ## %cond_next
+; CHECK-NEXT: jne LBB0_27
+; CHECK-NEXT: ## %bb.9: ## %cond_next
; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %rax
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: movq 80(%rax), %rax
-; CHECK-NEXT: LBB0_9: ## %bb4
+; CHECK-NEXT: LBB0_10: ## %bb4
; CHECK-NEXT: movq 96(%rax), %rax
; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: movq %rbx, %rsi
; CHECK-NEXT: callq *40(%rax)
-; CHECK-NEXT: jmp LBB0_24
-; CHECK-NEXT: LBB0_4: ## %cond_true.i
+; CHECK-NEXT: jmp LBB0_28
+; CHECK-NEXT: LBB0_11: ## %cond_true.i
; CHECK-NEXT: movl $4, %edi
; CHECK-NEXT: callq _feraiseexcept
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: xorl %r15d, %r15d
; CHECK-NEXT: testb %sil, %sil
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: je LBB0_14
+; CHECK-NEXT: LBB0_12: ## %cond_false.i
; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: xorl %r15d, %r15d
-; CHECK-NEXT: orb %al, %cl
-; CHECK-NEXT: jne LBB0_13
-; CHECK-NEXT: LBB0_5: ## %cond_next17.i
+; CHECK-NEXT: je LBB0_14
+; CHECK-NEXT: ## %bb.13: ## %cond_next17.i
; CHECK-NEXT: movzbl %sil, %eax
; CHECK-NEXT: divb %dl
; CHECK-NEXT: movzbl %ah, %ebx
-; CHECK-NEXT: LBB0_16: ## %ubyte_ctype_remainder.exit
+; CHECK-NEXT: jmp LBB0_18
+; CHECK-NEXT: LBB0_14: ## %cond_true.i200
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: jne LBB0_17
+; CHECK-NEXT: ## %bb.16: ## %cond_true14.i
+; CHECK-NEXT: movl $4, %edi
+; CHECK-NEXT: callq _feraiseexcept
+; CHECK-NEXT: LBB0_17: ## %ubyte_ctype_remainder.exit
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: LBB0_18: ## %ubyte_ctype_remainder.exit
; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: callq *224(%rax)
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: je LBB0_19
-; CHECK-NEXT: ## %bb.17: ## %cond_true61
+; CHECK-NEXT: je LBB0_21
+; CHECK-NEXT: ## %bb.19: ## %cond_true61
; CHECK-NEXT: movl %eax, %ebp
; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: movq _.str5@GOTPCREL(%rip), %rdi
@@ -139,8 +136,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: callq *200(%rax)
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: js LBB0_23
-; CHECK-NEXT: ## %bb.18: ## %cond_next73
+; CHECK-NEXT: js LBB0_27
+; CHECK-NEXT: ## %bb.20: ## %cond_next73
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq (%r14), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi
@@ -149,13 +146,13 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: movl %ebp, %edx
; CHECK-NEXT: callq *232(%rax)
; CHECK-NEXT: testl %eax, %eax
-; CHECK-NEXT: jne LBB0_23
-; CHECK-NEXT: LBB0_19: ## %cond_next89
+; CHECK-NEXT: jne LBB0_27
+; CHECK-NEXT: LBB0_21: ## %cond_next89
; CHECK-NEXT: movl $2, %edi
; CHECK-NEXT: callq _PyTuple_New
; CHECK-NEXT: testq %rax, %rax
-; CHECK-NEXT: je LBB0_23
-; CHECK-NEXT: ## %bb.20: ## %cond_next97
+; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: ## %bb.22: ## %cond_next97
; CHECK-NEXT: movq %rax, %r14
; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %r12
; CHECK-NEXT: movq (%r12), %rax
@@ -163,8 +160,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: callq *304(%rdi)
; CHECK-NEXT: testq %rax, %rax
-; CHECK-NEXT: je LBB0_21
-; CHECK-NEXT: ## %bb.25: ## %cond_next135
+; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: ## %bb.23: ## %cond_next135
; CHECK-NEXT: movb %r15b, 16(%rax)
; CHECK-NEXT: movq %rax, 24(%r14)
; CHECK-NEXT: movq (%r12), %rax
@@ -172,22 +169,22 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: callq *304(%rdi)
; CHECK-NEXT: testq %rax, %rax
-; CHECK-NEXT: je LBB0_21
-; CHECK-NEXT: ## %bb.26: ## %cond_next182
+; CHECK-NEXT: je LBB0_25
+; CHECK-NEXT: ## %bb.24: ## %cond_next182
; CHECK-NEXT: movb %bl, 16(%rax)
; CHECK-NEXT: movq %rax, 32(%r14)
; CHECK-NEXT: movq %r14, %rax
-; CHECK-NEXT: jmp LBB0_24
-; CHECK-NEXT: LBB0_21: ## %cond_true113
+; CHECK-NEXT: jmp LBB0_28
+; CHECK-NEXT: LBB0_25: ## %cond_true113
; CHECK-NEXT: decq (%r14)
-; CHECK-NEXT: jne LBB0_23
-; CHECK-NEXT: ## %bb.22: ## %cond_true126
+; CHECK-NEXT: jne LBB0_27
+; CHECK-NEXT: ## %bb.26: ## %cond_true126
; CHECK-NEXT: movq 8(%r14), %rax
; CHECK-NEXT: movq %r14, %rdi
; CHECK-NEXT: callq *48(%rax)
-; CHECK-NEXT: LBB0_23: ## %UnifiedReturnBlock
+; CHECK-NEXT: LBB0_27: ## %UnifiedReturnBlock
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: LBB0_24: ## %UnifiedReturnBlock
+; CHECK-NEXT: LBB0_28: ## %UnifiedReturnBlock
; CHECK-NEXT: addq $32, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 7bde1b7..7cdfd51 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -7,12 +7,15 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: setb %dl
+; X64-NEXT: cmpb $1, %dl
; X64-NEXT: adcq %rsi, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: retq
;
; X86-LABEL: test_add_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -21,8 +24,14 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %edx, 4(%eax)
@@ -30,6 +39,7 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind {
; X86-NEXT: movl %edi, 12(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%ret = add i128 %arg1, %arg2
ret i128 %ret
@@ -46,6 +56,8 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: cmpb $1, %cl
; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X86-NEXT: retl
%ret = add i64 %arg1, %arg2
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir
index ec9db78..dae2ad6 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir
@@ -157,8 +157,8 @@ body: |
; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
; X86-NEXT: RET 0
@@ -192,8 +192,8 @@ body: |
; X86-NEXT: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64)
; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64)
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
; X86-NEXT: RET 0
@@ -219,8 +219,8 @@ body: |
; X64-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF
; X64-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](s128)
; X64-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF1]](s128)
- ; X64-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
- ; X64-NEXT: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; X64-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]]
+ ; X64-NEXT: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; X64-NEXT: $rax = COPY [[UADDO]](s64)
; X64-NEXT: $rdx = COPY [[UADDE]](s64)
; X64-NEXT: RET 0
@@ -230,10 +230,10 @@ body: |
; X86-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s128)
; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s128)
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV4]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV5]], [[UADDO1]]
- ; X86-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV2]], [[UV6]], [[UADDE1]]
- ; X86-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UV3]], [[UV7]], [[UADDE3]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV4]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV5]], [[UADDO1]]
+ ; X86-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s8) = G_UADDE [[UV2]], [[UV6]], [[UADDE1]]
+ ; X86-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s8) = G_UADDE [[UV3]], [[UV7]], [[UADDE3]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE2]](s32), [[UADDE4]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir
index 19fe5b8..470a30fd 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir
@@ -25,6 +25,7 @@ body: |
; X64-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CTLZ]], [[C1]]
; X64-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C]]
; X64-NEXT: RET 0, implicit [[AND1]](s64)
+ ;
; X86-LABEL: name: test_ctlz35
; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx
; X86-NEXT: [[TRUNC:%[0-9]+]]:_(s35) = G_TRUNC [[COPY]](s64)
@@ -46,12 +47,15 @@ body: |
; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C]](s32)
; X86-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
; X86-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV1]](s64)
- ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[UV8]]
- ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[UV9]], [[USUBO1]]
+ ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV6]], [[UV8]]
+ ; X86-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[USUBO1]](s8)
+ ; X86-NEXT: [[ZEXT2:%[0-9]+]]:_(s8) = G_ZEXT [[TRUNC1]](s1)
+ ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV7]], [[UV9]], [[ZEXT2]]
+ ; X86-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[USUBE1]](s8)
; X86-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; X86-NEXT: [[TRUNC1:%[0-9]+]]:_(s35) = G_TRUNC [[MV2]](s64)
- ; X86-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s35)
- ; X86-NEXT: RET 0, implicit [[ZEXT2]](s64)
+ ; X86-NEXT: [[TRUNC3:%[0-9]+]]:_(s35) = G_TRUNC [[MV2]](s64)
+ ; X86-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC3]](s35)
+ ; X86-NEXT: RET 0, implicit [[ZEXT3]](s64)
%0(s64) = COPY $rdx
%1:_(s35) = G_TRUNC %0(s64)
%2:_(s35) = G_CTLZ %1
@@ -97,6 +101,7 @@ body: |
; X64-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ [[DEF]](s64)
; X64-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[CTLZ]](s64)
; X64-NEXT: RET 0, implicit [[COPY]](s64)
+ ;
; X86-LABEL: name: test_ctlz64
; X86: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir
index ee2b9ee..ac3bf33 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir
@@ -157,8 +157,8 @@ body: |
; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
- ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]]
+ ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
; X86-NEXT: RET 0
@@ -192,8 +192,8 @@ body: |
; X86-NEXT: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64)
; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64)
- ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
- ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]]
+ ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
; X86-NEXT: RET 0
@@ -219,8 +219,8 @@ body: |
; X64-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF
; X64-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](s128)
; X64-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF1]](s128)
- ; X64-NEXT: [[USUBO:%[0-9]+]]:_(s64), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
- ; X64-NEXT: [[USUBE:%[0-9]+]]:_(s64), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; X64-NEXT: [[USUBO:%[0-9]+]]:_(s64), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]]
+ ; X64-NEXT: [[USUBE:%[0-9]+]]:_(s64), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; X64-NEXT: $rax = COPY [[USUBO]](s64)
; X64-NEXT: $rdx = COPY [[USUBE]](s64)
; X64-NEXT: RET 0
@@ -230,10 +230,10 @@ body: |
; X86-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF
; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s128)
; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s128)
- ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV4]]
- ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV5]], [[USUBO1]]
- ; X86-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV2]], [[UV6]], [[USUBE1]]
- ; X86-NEXT: [[USUBE4:%[0-9]+]]:_(s32), [[USUBE5:%[0-9]+]]:_(s1) = G_USUBE [[UV3]], [[UV7]], [[USUBE3]]
+ ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV4]]
+ ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV5]], [[USUBO1]]
+ ; X86-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s8) = G_USUBE [[UV2]], [[UV6]], [[USUBE1]]
+ ; X86-NEXT: [[USUBE4:%[0-9]+]]:_(s32), [[USUBE5:%[0-9]+]]:_(s8) = G_USUBE [[UV3]], [[UV7]], [[USUBE3]]
; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE2]](s32), [[USUBE4]](s32)
; X86-NEXT: $rax = COPY [[MV]](s64)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir
index 9807d13..57e729f 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir
@@ -32,8 +32,8 @@ body: |
; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32)
; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32)
; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8)
; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -97,8 +97,8 @@ body: |
; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[UV]](s32), [[C]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32)
; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C1]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C1]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32)
; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8)
; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir
index e2d10423..f5d8477 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir
@@ -32,8 +32,8 @@ body: |
; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32)
; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32)
; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8)
; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -99,8 +99,8 @@ body: |
; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[UV]](s32), [[C]]
; X86-NEXT: [[CTTZ:%[0-9]+]]:_(s32) = G_CTTZ [[UV1]](s32)
; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ]], [[C1]]
- ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]]
+ ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ]], [[C1]]
+ ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]]
; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32)
; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8)
; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
diff --git a/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll b/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll
new file mode 100644
index 0000000..41d890b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s
+
+define i32 @test_01(ptr %p, i64 %len, i32 %x) {
+; CHECK-LABEL: test_01:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: subq %rax, %rsi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: jne .LBB0_4
+; CHECK-NEXT: # %bb.2: # %backedge
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: imulq $4, %rsi, %rcx
+; CHECK-NEXT: addq %rdi, %rcx
+; CHECK-NEXT: cmpl %edx, (%rcx)
+; CHECK-NEXT: sete %cl
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.3: # %failure
+; CHECK-NEXT: .LBB0_4: # %exit
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: retq
+
+entry:
+ %scevgep = getelementptr i32, ptr %p, i64 -1
+ br label %loop
+
+loop: ; preds = %backedge, %entry
+ %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
+ %iv.next = add i64 %iv, -1
+ %cond_1 = icmp eq i64 %iv, 0
+ br i1 %cond_1, label %exit, label %backedge
+
+backedge: ; preds = %loop
+ %scevgep1 = getelementptr i32, ptr %scevgep, i64 %iv
+ %loaded = load atomic i32, ptr %scevgep1 unordered, align 4
+ %cond_2 = icmp eq i32 %loaded, %x
+ br i1 %cond_2, label %failure, label %loop
+
+exit: ; preds = %loop
+ ret i32 -1
+
+failure:
+ unreachable
+}
+
diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
index 8eac3eaf..76680ac 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
@@ -29,8 +29,8 @@ body: |
bb.0 (%ir-block.0):
%0(s32) = IMPLICIT_DEF
%1(s32) = IMPLICIT_DEF
- %2(s1) = IMPLICIT_DEF
- %3(s32), %4(s1) = G_UADDE %0, %1, %2
+ %2(s8) = IMPLICIT_DEF
+ %3(s32), %4(s8) = G_UADDE %0, %1, %2
RET 0
...
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir b/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir
index 773813f..b85180f 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir
@@ -27,25 +27,24 @@ body: |
bb.0 (%ir-block.0):
; X32-LABEL: name: test_add_i64
; X32: [[DEF:%[0-9]+]]:gr32 = IMPLICIT_DEF
- ; X32: [[DEF1:%[0-9]+]]:gr32 = IMPLICIT_DEF
- ; X32: [[DEF2:%[0-9]+]]:gr32 = IMPLICIT_DEF
- ; X32: [[DEF3:%[0-9]+]]:gr32 = IMPLICIT_DEF
- ; X32: [[ADD32rr:%[0-9]+]]:gr32 = ADD32rr [[DEF]], [[DEF2]], implicit-def $eflags
- ; X32: [[COPY:%[0-9]+]]:gr32 = COPY $eflags
- ; X32: $eflags = COPY [[COPY]]
- ; X32: [[ADC32rr:%[0-9]+]]:gr32 = ADC32rr [[DEF1]], [[DEF3]], implicit-def $eflags, implicit $eflags
- ; X32: [[COPY1:%[0-9]+]]:gr32 = COPY $eflags
- ; X32: $eax = COPY [[ADD32rr]]
- ; X32: $edx = COPY [[ADC32rr]]
- ; X32: RET 0, implicit $eax, implicit $edx
+ ; X32-NEXT: [[DEF1:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32-NEXT: [[DEF2:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32-NEXT: [[DEF3:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32-NEXT: [[ADD32rr:%[0-9]+]]:gr32 = ADD32rr [[DEF]], [[DEF2]], implicit-def $eflags
+ ; X32-NEXT: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 2, implicit $eflags
+ ; X32-NEXT: CMP8ri [[SETCCr]], 1, implicit-def $eflags
+ ; X32-NEXT: [[ADC32rr:%[0-9]+]]:gr32 = ADC32rr [[DEF1]], [[DEF3]], implicit-def $eflags, implicit $eflags
+ ; X32-NEXT: [[SETCCr1:%[0-9]+]]:gr8 = SETCCr 2, implicit $eflags
+ ; X32-NEXT: $eax = COPY [[ADD32rr]]
+ ; X32-NEXT: $edx = COPY [[ADC32rr]]
+ ; X32-NEXT: RET 0, implicit $eax, implicit $edx
%0(s32) = IMPLICIT_DEF
%1(s32) = IMPLICIT_DEF
%2(s32) = IMPLICIT_DEF
%3(s32) = IMPLICIT_DEF
%9(s8) = G_CONSTANT i8 0
- %4(s1) = G_TRUNC %9(s8)
- %5(s32), %6(s1) = G_UADDE %0, %2, %4
- %7(s32), %8(s1) = G_UADDE %1, %3, %6
+ %5(s32), %6(s8) = G_UADDE %0, %2, %9
+ %7(s32), %8(s8) = G_UADDE %1, %3, %6
$eax = COPY %5(s32)
$edx = COPY %7(s32)
RET 0, implicit $eax, implicit $edx
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll b/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll
new file mode 100644
index 0000000..0cf1372
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel=1 -global-isel-abort=1 | FileCheck %s
+
+; Issue #120029
+define i16 @use_carry_bit(i16 %2) {
+; CHECK-LABEL: use_carry_bit:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movw $1, %ax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: addw %di, %ax
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: cmovnew %di, %ax
+; CHECK-NEXT: retq
+ %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %2, i16 1)
+ %res = extractvalue { i16, i1 } %uadd, 0
+ %carry = extractvalue { i16, i1 } %uadd, 1
+ %ret = select i1 %carry, i16 %2, i16 %res
+ ret i16 %ret
+}
+
diff --git a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
index 7a035f5..be75d7c 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
@@ -7,12 +7,15 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq %rdx, %rax
+; X64-NEXT: setb %dl
+; X64-NEXT: cmpb $1, %dl
; X64-NEXT: sbbq %rcx, %rsi
; X64-NEXT: movq %rsi, %rdx
; X64-NEXT: retq
;
; X86-LABEL: test_sub_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -21,8 +24,14 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: cmpb $1, %bl
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %edx, 4(%eax)
@@ -30,6 +39,7 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind {
; X86-NEXT: movl %edi, 12(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
%ret = sub i128 %arg1, %arg2
ret i128 %ret
@@ -47,6 +57,8 @@ define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: cmpb $1, %cl
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: retl
%ret = sub i64 %arg1, %arg2
diff --git a/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll b/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll
new file mode 100644
index 0000000..ce7024d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/absolute-symbol-kernel-code-model.ll
@@ -0,0 +1,34 @@
+; RUN: llc --code-model=kernel < %s -asm-verbose=0 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: func_no_abs_sym
+define i64 @func_no_abs_sym() nounwind {
+ ; CHECK: movq $no_abs_sym, %rax
+ %1 = ptrtoint ptr @no_abs_sym to i64
+ ret i64 %1
+}
+
+; CHECK-LABEL: func_abs_sym
+define i64 @func_abs_sym() nounwind {
+ ; CHECK: movabsq $abs_sym, %rax
+ %1 = ptrtoint ptr @abs_sym to i64
+ ret i64 %1
+}
+
+; CHECK-LABEL: func_abs_sym_in_range
+define i64 @func_abs_sym_in_range() nounwind {
+ ;; The absolute_symbol range fits in 32 bits but we still use movabs
+ ;; since there's no benefit to using the sign extending instruction
+ ;; with absolute symbols.
+ ; CHECK: movabsq $abs_sym_in_range, %rax
+ %1 = ptrtoint ptr @abs_sym_in_range to i64
+ ret i64 %1
+}
+
+@no_abs_sym = external hidden global [0 x i8]
+@abs_sym = external hidden global [0 x i8], !absolute_symbol !0
+@abs_sym_in_range = external hidden global [0 x i8], !absolute_symbol !1
+
+!0 = !{i64 -1, i64 -1} ;; Full range
+!1 = !{i64 -2147483648, i64 2147483648} ;; In range
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
index b2651e9..de9caa5 100644
--- a/llvm/test/CodeGen/X86/apx/cf.ll
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -230,6 +230,24 @@ entry:
ret void
}
+define void @and_cond(i32 %a, i1 %b) {
+; CHECK-LABEL: and_cond:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: notb %sil
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %al, %sil
+; CHECK-NEXT: cfcmovnel %ecx, 0
+; CHECK-NEXT: retq
+ %is_pos = icmp sgt i32 %a, 0
+ %not_b = xor i1 %b, true
+ %cond = and i1 %not_b, %is_pos
+ %mask = insertelement <1 x i1> zeroinitializer, i1 %cond, i64 0
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr null, i32 1, <1 x i1> %mask)
+ ret void
+}
+
define i64 @redundant_test(i64 %num, ptr %p1, i64 %in) {
; CHECK-LABEL: redundant_test:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 0de308a..5152c005 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rsi), %xmm0
-; SSE2-NEXT: movaps 16(%rsi), %xmm1
-; SSE2-NEXT: movaps 32(%rsi), %xmm2
-; SSE2-NEXT: movaps 48(%rsi), %xmm3
-; SSE2-NEXT: movups %xmm3, (%rax)
-; SSE2-NEXT: movups %xmm2, (%rax)
-; SSE2-NEXT: movups %xmm1, (%rax)
-; SSE2-NEXT: movups %xmm0, (%rax)
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: pavgb 16(%rsi), %xmm1
+; SSE2-NEXT: pavgb 32(%rsi), %xmm2
+; SSE2-NEXT: pavgb 48(%rsi), %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rsi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
-; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqu %xmm3, (%rax)
+; AVX1-NEXT: vmovdqu %xmm2, (%rax)
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rsi), %ymm0
-; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX2-NEXT: vmovups %ymm1, (%rax)
-; AVX2-NEXT: vmovups %ymm0, (%rax)
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: avg_v64i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rsi), %zmm0
-; AVX512-NEXT: vmovups %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: avg_v64i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v64i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%1 = load <64 x i8>, ptr %a
%2 = load <64 x i8>, ptr %b
%3 = zext <64 x i8> %1 to <64 x i32>
%4 = zext <64 x i8> %2 to <64 x i32>
- %5 = add nuw nsw <64 x i32> %4, %4
+ %5 = add nuw nsw <64 x i32> %3, %4
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <64 x i32> %7 to <64 x i8>
@@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
ret void
}
-
define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll
index c9c09d7..3bce843 100644
--- a/llvm/test/CodeGen/X86/avx-shift.ll
+++ b/llvm/test/CodeGen/X86/avx-shift.ll
@@ -201,7 +201,7 @@ define <8 x i32> @vshift08_add(<8 x i32> %a, <8 x i32> %y) {
define <4 x i32> @vshift13(<4 x i32> %in) {
; CHECK-LABEL: vshift13:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,16]
; CHECK-NEXT: retq
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 70b3b99..1133cdfd 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -199,12 +199,12 @@ define <8 x i32> @mul_const5(<8 x i32> %x) {
define <8 x i32> @mul_const6(<8 x i32> %x) {
; X86-LABEL: mul_const6:
; X86: # %bb.0:
-; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [0,0,0,2,0,2,0,0]
; X86-NEXT: retl
;
; X64-LABEL: mul_const6:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,2,0,2,0,0]
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %y
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index 2aea9c1..632d90d 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -27,7 +27,7 @@ entry:
!1 = !{i64 0, !"_ZTSFivE.generalized"}
!2 = !{i64 0, !"_ZTSFviE.generalized"}
-; CHECK: .section .callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@progbits,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index 1aabf66..ed6849a 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -1,8 +1,8 @@
;; Test if temporary labels are generated for each indirect callsite.
-;; Test if the .callgraph section contains the MD5 hash of callees' type (type id)
+;; Test if the .llvm.callgraph section contains the MD5 hash of callees' type (type id)
;; is correctly paired with its corresponding temporary label generated for indirect
;; call sites annotated with !callee_type metadata.
-;; Test if the .callgraph section contains unique direct callees.
+;; Test if the .llvm.callgraph section contains unique direct callees.
; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s
@@ -36,7 +36,7 @@ entry:
!4 = !{!5}
!5 = !{i64 0, !"_ZTSFPvS_E.generalized"}
-; CHECK: .section .callgraph,"o",@progbits,.text
+; CHECK: .section .llvm.callgraph,"o",@progbits,.text
;; Version
; CHECK-NEXT: .byte 0
;; Flags
diff --git a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll
index 34dc5b8..49cc335 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll
@@ -1,7 +1,10 @@
-;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls.
+;; Tests that we store the type identifiers in .llvm.callgraph section of the object file for tailcalls.
+
+; REQUIRES: x86-registered-target
+; REQUIRES: arm-registered-target
; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \
-; RUN: llvm-readelf -x .callgraph - | FileCheck %s
+; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s
define i32 @check_tailcall(ptr %func, i8 %x) !type !0 {
entry:
@@ -27,7 +30,7 @@ declare !type !2 i32 @bar(i8 signext)
!2 = !{i64 0, !"_ZTSFicE.generalized"}
!3 = !{i64 0, !"_ZTSFiiE.generalized"}
-; CHECK: Hex dump of section '.callgraph':
+; CHECK: Hex dump of section '.llvm.callgraph':
; CHECK-NEXT: 0x00000000 00050000 00000000 00008e19 0b7f3326
; CHECK-NEXT: 0x00000010 e3000154 86bc5981 4b8e3000 05000000
;; Verify that the type id 0x308e4b8159bc8654 is in section.
diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll
index c144a24..8a1c6ca 100644
--- a/llvm/test/CodeGen/X86/call-graph-section.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section.ll
@@ -1,7 +1,10 @@
-;; Tests that we store the type identifiers in .callgraph section of the object file.
+;; Tests that we store the type identifiers in .llvm.callgraph section of the object file.
+
+; REQUIRES: x86-registered-target
+; REQUIRES: arm-registered-target
; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -filetype=obj -o - < %s | \
-; RUN: llvm-readelf -x .callgraph - | FileCheck %s
+; RUN: llvm-readelf -x .llvm.callgraph - | FileCheck %s
declare !type !0 void @foo()
@@ -31,7 +34,7 @@ entry:
;; Make sure following type IDs are in call graph section
;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814
-; CHECK: Hex dump of section '.callgraph':
+; CHECK: Hex dump of section '.llvm.callgraph':
; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000000
; CHECK-NEXT: 0x00000010 00000324 44f731f5 eecb3e54 86bc5981
; CHECK-NEXT: 0x00000020 4b8e307a de6814f8 97fd77
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index ae4d24f..29c41ca 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -66,7 +66,7 @@ define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,16]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
@@ -120,12 +120,12 @@ define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967294,4294967292,4294967280]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967294,4294967292,4294967280]
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
ret <4 x i32> %1
@@ -176,12 +176,12 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,12,1280,458752]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,12,1280,458752]
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
%2 = mul <4 x i32> %1, <i32 1, i32 3, i32 5, i32 7>
@@ -193,7 +193,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
@@ -210,7 +210,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
@@ -226,7 +226,7 @@ define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -246,7 +246,7 @@ define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,256,65536]
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -268,13 +268,13 @@ define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
; SSE: # %bb.0:
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,2,0]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_add:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,6,2,0]
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll
index a5d9846..4bdf20d 100644
--- a/llvm/test/CodeGen/X86/combine-multiplies.ll
+++ b/llvm/test/CodeGen/X86/combine-multiplies.ll
@@ -142,9 +142,9 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [22,33,44,55]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [33,u,55,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 70335f8..ff5329c 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -204,16 +204,16 @@ define i32 @PR43159(ptr %a0) {
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [344322273,344322273,1916962805,1916962805]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $1, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1645975491,344322273,2164392969,1916962805]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT: psrld $6, %xmm1
; SSE-NEXT: movd %xmm1, %edi
@@ -226,15 +226,15 @@ define i32 @PR43159(ptr %a0) {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,344322273,1916962805,1916962805]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrld $7, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1645975491,344322273,2164392969,1916962805]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %edi
@@ -247,9 +247,9 @@ define i32 @PR43159(ptr %a0) {
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -270,9 +270,9 @@ define i32 @PR43159(ptr %a0) {
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -293,9 +293,9 @@ define i32 @PR43159(ptr %a0) {
; AVX512DQVL: # %bb.0: # %entry
; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [344322273,u,1916962805,u]
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1645975491,344322273,2164392969,1916962805]
; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index 65d74c8..e7152ec 100644
--- a/llvm/test/CodeGen/X86/combine-rotates.ll
+++ b/llvm/test/CodeGen/X86/combine-rotates.ll
@@ -10,9 +10,9 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_rot_rot:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [524288,131072,32768,8192]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 6bcbfe1..f7baee9 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2927,7 +2927,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -2947,7 +2947,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
@@ -2971,7 +2971,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
@@ -3044,7 +3044,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
+; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 1ce10c37..9548967 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -88,7 +88,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,u,8192,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -97,7 +97,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65536,32768,16384,8192]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
@@ -198,16 +198,16 @@ define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,64,256,1024]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,u,1024,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,64,256,1024]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl1:
@@ -304,17 +304,17 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [131072,524288,2097152,8388608]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [524288,u,8388608,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [33554432,134217728,536870912,2147483648]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [134217728,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm2, %xmm0
@@ -323,10 +323,10 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [131072,524288,2097152,8388608]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [33554432,134217728,536870912,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -673,9 +673,9 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -683,7 +683,7 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -726,9 +726,9 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,u,16,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -736,7 +736,7 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -765,7 +765,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [20,20,20,20]
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_mul0:
@@ -787,21 +787,21 @@ define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10,24,56,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [24,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [10,24,56,128]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [10,24,56,128]
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
@@ -813,9 +813,9 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -823,7 +823,7 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
@@ -852,7 +852,7 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,u,32,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -863,7 +863,7 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,16,32]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 4b01c16..0ca79ad 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
; SSE-NEXT: psrad $2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,2,4,8]
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -291,7 +291,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,2,4,8]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -336,7 +336,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -358,7 +358,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -368,7 +368,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [4294967294,4294967292,4294967288,4294967280]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = srem <4 x i32> %x, <i32 -2, i32 -4, i32 -8, i32 -16>
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 5571519..233735d 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -502,11 +502,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,16,0,8,8,0,0,0,0,0,2,0,2,0,0,0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -517,7 +517,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
@@ -530,7 +530,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
@@ -541,7 +541,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
@@ -630,7 +630,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0]
; SSE2-NEXT: psrlw $15, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
@@ -641,7 +641,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [171,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm2
; SSE41-NEXT: psrlw $7, %xmm2
@@ -654,7 +654,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX-LABEL: combine_vec_udiv_nonuniform4:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [171,0,0,0]
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
@@ -665,14 +665,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
-; XOP-NEXT: movl $171, %eax
+; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
-; XOP-NEXT: movl $249, %eax
-; XOP-NEXT: vmovd %eax, %xmm2
-; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [171,0,0,0]
+; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2
+; XOP-NEXT: vpshlb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
@@ -691,7 +689,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
; SSE2-NEXT: paddw %xmm3, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,u,0,4,0,4,16,0,4,0,0,4,0,0,0,16]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index 715d5c7..34c7d3d 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -327,7 +327,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,4,8,16]
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: paddd %xmm1, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
@@ -338,7 +338,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,4,8,16]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/cpus-intel.ll b/llvm/test/CodeGen/X86/cpus-intel.ll
index 40c38c2..646629d 100644
--- a/llvm/test/CodeGen/X86/cpus-intel.ll
+++ b/llvm/test/CodeGen/X86/cpus-intel.ll
@@ -38,6 +38,8 @@
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=lunarlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
@@ -104,6 +106,8 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lunarlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=gracemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pantherlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=wildcatlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=novalake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=clearwaterforest 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=diamondrapids 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index 345b2b9..19b9452 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -437,9 +437,9 @@ define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,256,128,64]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,64,u]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index df97f49..252cb33 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -574,9 +574,9 @@ define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: fshl_v4i32_undef1_cst:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,1024,2048,4096]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1024,u,4096,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
@@ -746,9 +746,9 @@ define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: fshr_v4i32_undef1_cst:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [8388608,4194304,2097152,1048576]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [4194304,u,1048576,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
index a0c243b..b2b0a6d 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
@@ -1,18 +1,101 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-;; A minimal test case. llc will crash if global variables already has a section
-;; prefix. Subsequent PRs will expand on this test case to test the hotness
-;; reconciliation implementation.
+;; Requires asserts for -debug-only.
+; REQUIRES: asserts
-; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+; RUN: -partition-static-data-sections=true \
+; RUN: -debug-only=static-data-profile-info \
+; RUN: -data-sections=true -unique-section-names=false \
+; RUN: input-with-data-access-prof-on.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR
+
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
; RUN: -partition-static-data-sections=true \
+; RUN: -debug-only=static-data-profile-info \
; RUN: -data-sections=true -unique-section-names=false \
-; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN: input-with-data-access-prof-off.ll -o - 2>&1 | FileCheck %s --check-prefixes=OFF
+
+; LOG: hot_bss has section prefix hot, the max from data access profiles as hot and PGO counters as hot
+; LOG: data_unknown_hotness has section prefix <empty>, the max from data access profiles as <empty> and PGO counters as unlikely
+; LOG: external_relro_array has section prefix unlikely, solely from data access profiles
+
+; IR: .type hot_bss,@object
+; IR-NEXT: .section .bss.hot.,"aw"
+; IR: .type data_unknown_hotness,@object
+; IR-NEXT: .section .data,"aw"
+; IR: .type external_relro_array,@object
+; IR-NEXT: .section .data.rel.ro.unlikely.,"aw"
+
+
+; OFF: .type hot_bss,@object
+; OFF-NEXT: .section .bss.hot.,"aw"
+; OFF: .type data_unknown_hotness,@object
+; OFF-NEXT: .section .data.unlikely.,"aw"
+;; Global variable section prefix metadata is not used when
+;; module flag `EnableDataAccessProf` is 0, and @external_relro_array has
+;; external linkage, so analysis based on PGO counters doesn't apply.
+; OFF: .type external_relro_array,@object # @external_relro_array
+; OFF-NEXT: .section .data.rel.ro,"aw"
+
+;--- input-with-data-access-prof-on.ll
+; Internal vars
+@hot_bss = internal global i32 0, !section_prefix !17
+@data_unknown_hotness = internal global i32 1
+; External vars
+@external_relro_array = constant [2 x ptr] [ptr @hot_bss, ptr @data_unknown_hotness], !section_prefix !18
+
+define void @cold_func() !prof !15 {
+ %9 = load i32, ptr @data_unknown_hotness
+ %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9)
+ ret void
+}
+
+define void @hot_func() !prof !14 {
+ %9 = load i32, ptr @hot_bss
+ %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9)
+ ret void
+}
+
+declare i32 @func_taking_arbitrary_param(...)
-; ERR: Global variable hot_bss already has a section prefix hot
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 2, !"EnableDataAccessProf", i32 1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 1460183}
+!5 = !{!"MaxCount", i64 849024}
+!6 = !{!"MaxInternalCount", i64 32769}
+!7 = !{!"MaxFunctionCount", i64 849024}
+!8 = !{!"NumCounts", i64 23627}
+!9 = !{!"NumFunctions", i64 3271}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13}
+!12 = !{i32 990000, i64 166, i32 73}
+!13 = !{i32 999999, i64 3, i32 1443}
+!14 = !{!"function_entry_count", i64 100000}
+!15 = !{!"function_entry_count", i64 1}
+!16 = !{!"branch_weights", i32 1, i32 99999}
+!17 = !{!"section_prefix", !"hot"}
+!18 = !{!"section_prefix", !"unlikely"}
+
+;--- input-with-data-access-prof-off.ll
+; Same as file above except that module flag `EnableDataAccessProf` has value 0.
+; Internal vars
@hot_bss = internal global i32 0, !section_prefix !17
+@data_unknown_hotness = internal global i32 1
+; External vars
+@external_relro_array = constant [2 x ptr] [ptr @hot_bss, ptr @data_unknown_hotness], !section_prefix !18
+
+define void @cold_func() !prof !15 {
+ %9 = load i32, ptr @data_unknown_hotness
+ %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9)
+ ret void
+}
define void @hot_func() !prof !14 {
%9 = load i32, ptr @hot_bss
@@ -22,8 +105,9 @@ define void @hot_func() !prof !14 {
declare i32 @func_taking_arbitrary_param(...)
-!llvm.module.flags = !{!1}
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 2, !"EnableDataAccessProf", i32 0}
!1 = !{i32 1, !"ProfileSummary", !2}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
!3 = !{!"ProfileFormat", !"InstrProf"}
@@ -41,3 +125,4 @@ declare i32 @func_taking_arbitrary_param(...)
!15 = !{!"function_entry_count", i64 1}
!16 = !{!"branch_weights", i32 1, i32 99999}
!17 = !{!"section_prefix", !"hot"}
+!18 = !{!"section_prefix", !"unlikely"}
diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll
index ce06d17..604b4fd 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition.ll
@@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu"
; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8
; AGG-NEXT: .section .data.unlikely.,"aw",@progbits
+;; The `.section` directive is omitted for .data with -unique-section-names=false.
+; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; For @data_with_unknown_hotness
; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness
; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits
; UNIQ: .section .data,"aw",@progbits,unique,9
-; The `.section` directive is omitted for .data with -unique-section-names=false.
-; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; AGG: .data
; COMMON: .Ldata_with_unknown_hotness:
-; For @hot_data_custom_bar_section
-; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation
; COMMON: .type hot_data_custom_bar_section,@object
; SYM-NEXT: .section bar,"aw",@progbits
; SYM: hot_data_custom_bar_section
; UNIQ: .section bar,"aw",@progbits
; AGG: .section bar,"aw",@progbits
+; SYM: .section .data.llvm.fake_var,"aw"
+; UNIQ: .section .data,"aw"
+; AGG: .data
+
+;; No section for linker declaration
+; COMMON-NOT: qux
+
@.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1
@.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1
@hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3]
@@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu"
@data3 = internal global i32 3
@data_with_unknown_hotness = private global i32 5
@hot_data_custom_bar_section = internal global i32 101 #0
+@llvm.fake_var = internal global i32 123
+@qux = external global i64
define void @cold_func(i32 %0) !prof !15 {
%2 = load i32, ptr @cold_bss
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 1a2aac6..b45d01e 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -499,9 +499,9 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,16776960,2147483648]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,u,2147483648,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pand %xmm1, %xmm0
@@ -524,9 +524,9 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,16776960,2147483648]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,u,2147483648,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE2-NEXT: pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc..019bca7 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -28,16 +28,16 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1073741824,u,67108864,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-NEXT: movdqa %xmm1, %xmm3
; CHECK-NEXT: psrld $1, %xmm3
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [9,4,16,64]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,u,64,u]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: psubd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c30..2a2a4a5 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2057,10 +2057,10 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294934528,0,0,0]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,7,0,42,0,32,0]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32768,4294934528,0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT: paddd %xmm2, %xmm1
@@ -2071,15 +2071,15 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32]
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32768,4294934528,0,0]
; AVX1-NEXT: vphaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddwd_negative2:
; AVX256: # %bb.0:
; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX256-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX256-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [32768,4294934528,0,0,1,7,42,32]
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 4cde581..caec02e 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4765,6 +4765,66 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32
}
declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32 immarg, <8 x i1>)
+define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) {
+; X64-LABEL: pr163023_sext:
+; X64: # %bb.0:
+; X64-NEXT: kxnorw %k0, %k0, %k1
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpgatherdd (%rdi,%zmm0), %zmm1 {%k1}
+; X64-NEXT: vmovdqa64 %zmm1, %zmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: pr163023_sext:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kxnorw %k0, %k0, %k1
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1}
+; X86-NEXT: vmovdqa64 %zmm1, %zmm0
+; X86-NEXT: retl
+ %addr.p = ptrtoint ptr %a0 to i64
+ %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0
+ %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer
+ %ofs = sext <16 x i32> %a1 to <16 x i64>
+ %addr = add nuw <16 x i64> %addr.splat, %ofs
+ %ptr = inttoptr <16 x i64> %addr to <16 x ptr>
+ %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr, i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
+ ret <16 x i32> %gather
+}
+
+define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
+; X64-LABEL: pr163023_zext:
+; X64: # %bb.0:
+; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-NEXT: kxnorw %k0, %k0, %k1
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-NEXT: kxnorw %k0, %k0, %k2
+; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
+; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
+; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: pr163023_zext:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kxnorw %k0, %k0, %k1
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1}
+; X86-NEXT: vmovdqa64 %zmm1, %zmm0
+; X86-NEXT: retl
+ %addr.p = ptrtoint ptr %a0 to i64
+ %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0
+ %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer
+ %ofs = zext <16 x i32> %a1 to <16 x i64>
+ %addr = add nuw <16 x i64> %addr.splat, %ofs
+ %ptr = inttoptr <16 x i64> %addr to <16 x ptr>
+ %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr, i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
+ ret <16 x i32> %gather
+}
+
;
; PR45906
; This used to cause fast-isel to generate bad copy instructions that would
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index d752659..04f0a65 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-NOVBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
@@ -883,6 +883,30 @@ define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal
}
define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: mul256:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul256:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0
@@ -960,6 +984,21 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
}
define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
+; CHECK-SKX-NOVBMI-LABEL: mul512:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
+; CHECK-SKX-NOVBMI-NEXT: vzeroupper
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: mul512:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
@@ -1137,6 +1176,14 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector
}
define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_zeroes:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
@@ -1192,6 +1239,14 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w
}
define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-SKX-NOVBMI-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK-SKX-NOVBMI: # %bb.0:
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, 32(%rdi), %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpsrlw $8, (%rdi), %ymm1
+; CHECK-SKX-NOVBMI-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-SKX-NOVBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-SKX-NOVBMI-NEXT: retq
+;
; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign:
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 693d199..9729fd7 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -100,7 +100,7 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-LABEL: p4_vector_urem_by_const__splat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; SSE4-NEXT: psrld $1, %xmm0
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
@@ -128,10 +128,10 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,u,954437177,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2147483648,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: psrlq $32, %xmm0
@@ -145,7 +145,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-LABEL: p5_vector_urem_by_const__nonsplat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,3435973837,2863311531,954437177]
; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648]
; SSE4-NEXT: pmuludq %xmm0, %xmm1
; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -159,7 +159,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; AVX2-LABEL: p5_vector_urem_by_const__nonsplat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,3435973837,2863311531,954437177]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -196,7 +196,7 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32
; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; SSE4-NEXT: movdqa %xmm0, %xmm1
; SSE4-NEXT: psrld $1, %xmm1
; SSE4-NEXT: pslld $31, %xmm0
@@ -312,7 +312,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [6,6,6,6]
; SSE4-NEXT: psubd %xmm2, %xmm0
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 9aee2f1..00731fe 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -91,7 +91,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117]
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
diff --git a/llvm/test/CodeGen/X86/pr160612.ll b/llvm/test/CodeGen/X86/pr160612.ll
new file mode 100644
index 0000000..6572c42
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr160612.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test for issue #160612: OR conditions in branches should use multiple branches
+; instead of materializing booleans with SETCC when no special optimizations apply.
+
+declare void @subroutine_foo()
+declare void @subroutine_bar()
+
+; Original issue: (x == 0 || y == 0) was generating SETCC + TEST + BRANCH
+; instead of using two conditional branches directly.
+define void @func_a(i32 noundef %x, i32 noundef %y) {
+; CHECK-LABEL: func_a:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: jne subroutine_bar@PLT # TAILCALL
+; CHECK-NEXT: # %bb.2: # %if.then
+; CHECK-NEXT: jmp subroutine_foo@PLT # TAILCALL
+entry:
+ %cmp = icmp eq i32 %x, 0
+ %cmp1 = icmp eq i32 %y, 0
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.else
+
+if.then:
+ tail call void @subroutine_foo()
+ br label %if.end
+
+if.else:
+ tail call void @subroutine_bar()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Reference implementation that already generated optimal code.
+; This should continue to generate the same optimal code.
+define void @func_b(i32 noundef %x, i32 noundef %y) {
+; CHECK-LABEL: func_b:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL
+; CHECK-NEXT: # %bb.1: # %if.else
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL
+; CHECK-NEXT: # %bb.2: # %if.else3
+; CHECK-NEXT: jmp subroutine_bar@PLT # TAILCALL
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ tail call void @subroutine_foo()
+ br label %if.end4
+
+if.else:
+ %cmp1 = icmp eq i32 %y, 0
+ br i1 %cmp1, label %if.then2, label %if.else3
+
+if.then2:
+ tail call void @subroutine_foo()
+ br label %if.end4
+
+if.else3:
+ tail call void @subroutine_bar()
+ br label %if.end4
+
+if.end4:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll
index 4ea3101..cec093c 100644
--- a/llvm/test/CodeGen/X86/pr162812.ll
+++ b/llvm/test/CodeGen/X86/pr162812.ll
@@ -34,61 +34,43 @@ define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) {
;
; SSE42-LABEL: PR162812:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm2, %xmm5
-; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psrlw $2, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [8224,8224,8224,8224,8224,8224,8224,8224]
+; SSE42-NEXT: pand %xmm5, %xmm2
+; SSE42-NEXT: paddb %xmm2, %xmm2
+; SSE42-NEXT: paddb %xmm2, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm6
-; SSE42-NEXT: psllw $2, %xmm6
-; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; SSE42-NEXT: pand %xmm7, %xmm6
-; SSE42-NEXT: psrlw $2, %xmm5
-; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224]
-; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: paddb %xmm0, %xmm6
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm4
+; SSE42-NEXT: psrlw $2, %xmm3
+; SSE42-NEXT: pand %xmm3, %xmm5
; SSE42-NEXT: paddb %xmm5, %xmm5
-; SSE42-NEXT: movdqa %xmm5, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm6
-; SSE42-NEXT: paddb %xmm2, %xmm6
; SSE42-NEXT: paddb %xmm5, %xmm5
+; SSE42-NEXT: movdqa %xmm1, %xmm2
+; SSE42-NEXT: paddb %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm5, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; SSE42-NEXT: movdqa %xmm1, %xmm5
-; SSE42-NEXT: psllw $2, %xmm5
-; SSE42-NEXT: pand %xmm7, %xmm5
-; SSE42-NEXT: psrlw $2, %xmm3
-; SSE42-NEXT: pand %xmm3, %xmm4
-; SSE42-NEXT: paddb %xmm4, %xmm4
-; SSE42-NEXT: movdqa %xmm4, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: paddb %xmm1, %xmm3
-; SSE42-NEXT: paddb %xmm4, %xmm4
+; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movdqa %xmm4, %xmm0
-; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: PR162812:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR162812:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr49087.ll b/llvm/test/CodeGen/X86/pr49087.ll
deleted file mode 100644
index 1a29222..0000000
--- a/llvm/test/CodeGen/X86/pr49087.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -global-isel < %s 2>&1 | FileCheck %s
-; REQUIRES: asserts
-; XFAIL: *
-
-define i32 @test_01(ptr %p, i64 %len, i32 %x) {
-; CHECK-LABEL: test_01
-
-entry:
- %scevgep = getelementptr i32, ptr %p, i64 -1
- br label %loop
-
-loop: ; preds = %backedge, %entry
- %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ]
- %iv.next = add i64 %iv, -1
- %cond_1 = icmp eq i64 %iv, 0
- br i1 %cond_1, label %exit, label %backedge
-
-backedge: ; preds = %loop
- %scevgep1 = getelementptr i32, ptr %scevgep, i64 %iv
- %loaded = load atomic i32, ptr %scevgep1 unordered, align 4
- %cond_2 = icmp eq i32 %loaded, %x
- br i1 %cond_2, label %failure, label %loop
-
-exit: ; preds = %loop
- ret i32 -1
-
-failure:
- unreachable
-}
-
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 885b075..59b03f8 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW: # %bb.0:
; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll
index ea22b08..954ea8f 100644
--- a/llvm/test/CodeGen/X86/relptr-rodata.ll
+++ b/llvm/test/CodeGen/X86/relptr-rodata.ll
@@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: .long hidden-rodata
@rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32)
+; CHECK: .section .rodata.rodata_ptrtoaddr
+; CHECK: rodata_ptrtoaddr:
+; CHECK: .long hidden-rodata_ptrtoaddr
+@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro1
; CHECK: relro1:
; CHECK: .long default-relro1
@relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro1_ptrtoaddr
+; CHECK: relro1_ptrtoaddr:
+; CHECK: .long default-relro1_ptrtoaddr
+@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro2
; CHECK: relro2:
; CHECK: .long hidden-relro2
@relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro2_ptrtoaddr
+; CHECK: relro2_ptrtoaddr:
+; CHECK: .long hidden-relro2_ptrtoaddr
+@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .rodata.obj
; CHECK-NEXT: .globl obj
; CHECK: obj:
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index 1ead3f9..7d0ec64 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -149,19 +149,12 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; Result would undershift
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
-; X86-LABEL: no_extract_shl:
-; X86: # %bb.0:
-; X86-NEXT: vpsllq $24, %ymm0, %ymm1
-; X86-NEXT: vpsrlq $39, %ymm0, %ymm0
-; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0
-; X86-NEXT: retl
-;
-; X64-LABEL: no_extract_shl:
-; X64: # %bb.0:
-; X64-NEXT: vpsllq $24, %ymm0, %ymm1
-; X64-NEXT: vpsrlq $39, %ymm0, %ymm0
-; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: no_extract_shl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $24, %ymm0, %ymm1
+; CHECK-NEXT: vpsrlq $39, %ymm0, %ymm0
+; CHECK-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & m64bcst) | ymm1
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
%rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
%lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
@@ -171,19 +164,12 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; Result would overshift
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
-; X86-LABEL: no_extract_shrl:
-; X86: # %bb.0:
-; X86-NEXT: vpsrld $9, %xmm0, %xmm1
-; X86-NEXT: vpslld $25, %xmm0, %xmm0
-; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: no_extract_shrl:
-; X64: # %bb.0:
-; X64-NEXT: vpsrld $9, %xmm0, %xmm1
-; X64-NEXT: vpslld $25, %xmm0, %xmm0
-; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: no_extract_shrl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrld $9, %xmm0, %xmm1
+; CHECK-NEXT: vpslld $25, %xmm0, %xmm0
+; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
%rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
%lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll
index 4568191..7873ffa 100644
--- a/llvm/test/CodeGen/X86/sdiv-exact.ll
+++ b/llvm/test/CodeGen/X86/sdiv-exact.ll
@@ -87,7 +87,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2863311531,u,3264175145,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -95,7 +95,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3264175145,3264175145]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
ret <4 x i32> %div
@@ -112,7 +112,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2863311531,u,3303820997,u]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -121,7 +121,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X64-LABEL: test6:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3303820997,3303820997]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
ret <4 x i32> %div
@@ -131,16 +131,16 @@ define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3264175145,u,1749801491,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
ret <4 x i32> %div
@@ -156,7 +156,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,u,2863311531,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -164,7 +164,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X64-LABEL: test8:
; X64: # %bb.0:
; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,2863311531,2863311531]
; X64-NEXT: retq
%div = sdiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 5aa266d..d018c53 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1447,3 +1447,175 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
%r = icmp eq i512 %a, %b
ret i1 %r
}
+
+; Tests for any/allbits from memory.
+
+define i1 @anybits_i128_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i128_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq (%rdi), %rax
+; ANY-NEXT: orq 8(%rdi), %rax
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp ne i128 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i128_load_arg(ptr %w) {
+; SSE2-LABEL: allbits_i128_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb (%rdi), %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allbits_i128_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: retq
+;
+; AVXANY-LABEL: allbits_i128_load_arg:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVXANY-NEXT: vptest %xmm1, %xmm0
+; AVXANY-NEXT: setb %al
+; AVXANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp eq i128 %ld, -1
+ ret i1 %cmp
+}
+
+define i1 @anybits_i256_load_arg(ptr %w) {
+; SSE-LABEL: anybits_i256_load_arg:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: orq 24(%rdi), %rcx
+; SSE-NEXT: orq 16(%rdi), %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVXANY-LABEL: anybits_i256_load_arg:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXANY-NEXT: vptest %ymm0, %ymm0
+; AVXANY-NEXT: setne %al
+; AVXANY-NEXT: vzeroupper
+; AVXANY-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp ne i256 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i256_load_arg(ptr %w) {
+; SSE-LABEL: allbits_i256_load_arg:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: andq 24(%rdi), %rcx
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: andq %rcx, %rax
+; SSE-NEXT: cmpq $-1, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: allbits_i256_load_arg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vptest %ymm1, %ymm0
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: allbits_i256_load_arg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: allbits_i256_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp eq i256 %ld, -1
+ ret i1 %cmp
+}
+
+define i1 @anybits_i512_load_arg(ptr %w) {
+; NO512-LABEL: anybits_i512_load_arg:
+; NO512: # %bb.0:
+; NO512-NEXT: movq 16(%rdi), %rax
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: movq 8(%rdi), %rdx
+; NO512-NEXT: movq 24(%rdi), %rsi
+; NO512-NEXT: orq 56(%rdi), %rsi
+; NO512-NEXT: orq 40(%rdi), %rdx
+; NO512-NEXT: orq %rsi, %rdx
+; NO512-NEXT: orq 48(%rdi), %rax
+; NO512-NEXT: orq 32(%rdi), %rcx
+; NO512-NEXT: orq %rax, %rcx
+; NO512-NEXT: orq %rdx, %rcx
+; NO512-NEXT: setne %al
+; NO512-NEXT: retq
+;
+; AVX512-LABEL: anybits_i512_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp ne i512 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i512_load_arg(ptr %w) {
+; NO512-LABEL: allbits_i512_load_arg:
+; NO512: # %bb.0:
+; NO512-NEXT: movq 16(%rdi), %rax
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: movq 8(%rdi), %rdx
+; NO512-NEXT: movq 24(%rdi), %rsi
+; NO512-NEXT: andq 56(%rdi), %rsi
+; NO512-NEXT: andq 40(%rdi), %rdx
+; NO512-NEXT: andq %rsi, %rdx
+; NO512-NEXT: andq 48(%rdi), %rax
+; NO512-NEXT: andq 32(%rdi), %rcx
+; NO512-NEXT: andq %rax, %rcx
+; NO512-NEXT: andq %rdx, %rcx
+; NO512-NEXT: cmpq $-1, %rcx
+; NO512-NEXT: sete %al
+; NO512-NEXT: retq
+;
+; AVX512-LABEL: allbits_i512_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp eq i512 %ld, -1
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index e53eed4..504a392 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1760,7 +1760,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65535,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1781,7 +1781,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,65535,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1864,7 +1864,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65536,65536,65536,65536]
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
@@ -1876,7 +1876,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65536,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1885,7 +1885,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65536,65536,65536,65536]
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
@@ -1895,7 +1895,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,65536,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
@@ -1922,7 +1922,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,32768,32768]
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
@@ -1934,7 +1934,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,32768,u,u]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
@@ -1943,7 +1943,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,32768,32768]
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
@@ -1953,7 +1953,7 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
; X64-AVX-NEXT: movq c(%rip), %rax
; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,32768,u,u]
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 975ffd0..e8c05f9 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -336,13 +336,13 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,18778,18778,18778]
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,18778,18778,18778]
; SSE4-64-NEXT: retq
;
; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
@@ -838,13 +838,13 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; SSE-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [18778,18778,18778,18778]
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-64: # %bb.0:
; SSE-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [18778,18778,18778,18778]
; SSE-64-NEXT: retq
;
; AVX2-LABEL: test_mul_v4i32_v4i16_minsize:
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 42617c1..18588aa 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -24,7 +24,7 @@ define float @sqrt_ieee_ninf(float %f) #0 {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]]
+ ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
@@ -71,7 +71,7 @@ define float @sqrt_daz_ninf(float %f) #1 {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]]
+ ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index cc4bda8..650b562 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 2d07788..bb7245c 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -10,15 +10,15 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -30,10 +30,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -47,10 +47,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -63,7 +63,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -75,7 +75,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -109,7 +109,7 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -119,7 +119,7 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -168,7 +168,7 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -178,7 +178,7 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -234,7 +234,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -248,7 +248,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -308,7 +308,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -322,7 +322,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -367,15 +367,15 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -387,10 +387,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -404,10 +404,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -420,7 +420,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -432,7 +432,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -448,15 +448,15 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -468,10 +468,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -485,10 +485,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -501,7 +501,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -514,7 +514,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -534,14 +534,14 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -553,7 +553,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
@@ -568,9 +568,9 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -581,7 +581,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -593,7 +593,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -611,9 +611,9 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -636,11 +636,11 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -654,11 +654,11 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -671,7 +671,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -683,7 +683,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -701,9 +701,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -711,7 +711,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -725,11 +725,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -743,11 +743,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -760,7 +760,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -772,7 +772,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -806,7 +806,7 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -816,7 +816,7 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -871,7 +871,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -885,7 +885,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -929,15 +929,15 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -949,10 +949,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -966,10 +966,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -982,7 +982,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -994,7 +994,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1018,9 +1018,9 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pand %xmm0, %xmm2
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
@@ -1039,7 +1039,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-SSE41-NEXT: pand %xmm0, %xmm2
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,1,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -1053,7 +1053,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1067,7 +1067,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1080,7 +1080,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
@@ -1102,7 +1102,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1137,8 +1137,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1156,11 +1156,11 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [2147483648,2147483648,2,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1177,7 +1177,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -1196,7 +1196,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
@@ -1219,7 +1219,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [3067833783,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1227,7 +1227,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -1253,8 +1253,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1272,11 +1272,11 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
@@ -1293,7 +1293,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -1312,7 +1312,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
@@ -1333,14 +1333,14 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1352,7 +1352,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
@@ -1367,9 +1367,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1380,7 +1380,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1392,7 +1392,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1410,9 +1410,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1420,7 +1420,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1434,11 +1434,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,0,1,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1452,11 +1452,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1469,7 +1469,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1481,7 +1481,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1499,9 +1499,9 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1509,7 +1509,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1523,11 +1523,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,1,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1541,11 +1541,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1558,7 +1558,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1570,7 +1570,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1604,7 +1604,7 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -1614,7 +1614,7 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1669,7 +1669,7 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
@@ -1683,7 +1683,7 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1727,15 +1727,15 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1747,10 +1747,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,0,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1764,10 +1764,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1780,7 +1780,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1792,7 +1792,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1812,15 +1812,15 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1,1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1832,10 +1832,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1849,10 +1849,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1,1]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1865,7 +1865,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1877,7 +1877,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1895,9 +1895,9 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1905,7 +1905,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -1919,11 +1919,11 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1937,11 +1937,11 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1954,7 +1954,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1966,7 +1966,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -1984,15 +1984,15 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2004,10 +2004,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -2021,10 +2021,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -2037,7 +2037,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2049,7 +2049,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2067,9 +2067,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrlq $32, %xmm1
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2081,9 +2081,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2096,9 +2096,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -2110,7 +2110,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,0]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2122,7 +2122,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,0,1,0]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2138,9 +2138,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrlq $32, %xmm1
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2152,9 +2152,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,u,1,u]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
@@ -2167,9 +2167,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,u,1,u]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -2181,7 +2181,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,0]
; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2193,7 +2193,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,0,1,0]
; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -2335,10 +2335,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [34048,34048,26368,37632,21760,33024,22016,35072]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [20224,26368,6912,30976,33024,33024,33024,12032]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
@@ -2369,10 +2369,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4
; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [2304,0,10496,37632,33024,33024,21760,36096]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [22016,24320,37632,11008,12544,32512,16640,37632]
+; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
@@ -2417,10 +2417,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [34048,34048,26368,37632,21760,33024,22016,35072,2304,0,10496,37632,33024,33024,21760,36096]
+; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137,0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [20224,26368,6912,30976,33024,33024,33024,12032,22016,24320,37632,11008,12544,32512,16640,37632]
+; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index 3359202..d459d01 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -24,7 +24,7 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -34,7 +34,7 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -90,7 +90,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
@@ -104,7 +104,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
@@ -165,7 +165,7 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_odd_neg25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
@@ -175,7 +175,7 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_odd_neg25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -231,7 +231,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_srem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
@@ -245,7 +245,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_srem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
@@ -333,7 +333,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: psrld $31, %xmm1
; CHECK-SSE41-NEXT: psrad $3, %xmm2
; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [25,25,25,25]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -351,7 +351,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25,25,25,25]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -444,7 +444,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: psrld $31, %xmm1
; CHECK-SSE41-NEXT: psrad $5, %xmm2
; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [100,100,100,100]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -462,7 +462,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [100,100,100,100]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
index 271d11e..2b3f26a 100644
--- a/llvm/test/CodeGen/X86/udiv-exact.ll
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -87,7 +87,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2863311531,u,3264175145,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -95,7 +95,7 @@ define <4 x i32> @test5(<4 x i32> %x) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3264175145,3264175145]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
ret <4 x i32> %div
@@ -112,7 +112,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm0, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2863311531,u,3303820997,u]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqa %xmm1, %xmm0
@@ -121,7 +121,7 @@ define <4 x i32> @test6(<4 x i32> %x) {
; X64-LABEL: test6:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,3303820997,3303820997]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
ret <4 x i32> %div
@@ -131,16 +131,16 @@ define <4 x i32> @test7(<4 x i32> %x) {
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3264175145,u,1749801491,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,1749801491,1749801491]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
ret <4 x i32> %div
@@ -156,7 +156,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,u,2863311531,u]
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
@@ -164,7 +164,7 @@ define <4 x i32> @test8(<4 x i32> %x) {
; X64-LABEL: test8:
; X64: # %bb.0:
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,2863311531,2863311531]
; X64-NEXT: retq
%div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/X86/undo-mul-and.ll b/llvm/test/CodeGen/X86/undo-mul-and.ll
index c9c40099..6566153 100644
--- a/llvm/test/CodeGen/X86/undo-mul-and.ll
+++ b/llvm/test/CodeGen/X86/undo-mul-and.ll
@@ -63,9 +63,9 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_fail_no_splat(<4 x i32> %x) {
; CHECK-SSE-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,64]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,64,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -73,13 +73,13 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_fail_no_splat(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,64]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX512-LABEL: mul_and_to_neg_shl_and_vec_fail_no_splat:
; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,64]
; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
%mul = mul <4 x i32> %x, <i32 56, i32 56, i32 56, i32 64>
@@ -92,9 +92,9 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat1(<4 x i32> %x) {
; CHECK-SSE-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [56,56,56,48]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [56,u,48,u]
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -102,13 +102,13 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat1(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,48]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX512-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat1:
; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,48]
; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
%mul = mul <4 x i32> %x, <i32 56, i32 56, i32 56, i32 48>
@@ -131,7 +131,7 @@ define <4 x i32> @mul_and_to_neg_shl_and_vec_todo_no_splat2(<4 x i32> %x) {
;
; CHECK-AVX1-LABEL: mul_and_to_neg_shl_and_vec_todo_no_splat2:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [56,56,56,56]
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 7c1a1e2..759055d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
@@ -168,7 +167,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE41-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-NEXT: pinsrd $2, %edx, %xmm0
; SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [683,1463,819,u]
; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [2047,2047,2047,2047]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
@@ -194,7 +193,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2047,2047,2047,2047]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
@@ -219,7 +218,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX2-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -241,7 +240,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [683,1463,819,u]
; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047]
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 838086e..2228c09 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -10,10 +10,10 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -26,9 +26,9 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -42,9 +42,9 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -57,7 +57,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -68,7 +68,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,3264175145,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -87,9 +87,9 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -99,7 +99,7 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -108,7 +108,7 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_eq:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -122,9 +122,9 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -134,7 +134,7 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,858993460,2,858993460]
; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -143,7 +143,7 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_ne:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,4294967295,3435973837]
; CHECK-AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -159,12 +159,12 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -178,10 +178,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -195,10 +195,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -211,7 +211,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -222,7 +222,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -237,12 +237,12 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -256,10 +256,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -273,10 +273,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -289,7 +289,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -300,7 +300,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,4294967295,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -317,10 +317,10 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -333,9 +333,9 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -349,9 +349,9 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -364,7 +364,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -375,7 +375,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -390,10 +390,10 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -406,9 +406,9 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -422,9 +422,9 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -437,7 +437,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -448,7 +448,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,4294967295,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -467,10 +467,10 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -482,7 +482,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -496,8 +496,8 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -508,7 +508,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -519,7 +519,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -536,12 +536,12 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -555,10 +555,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -572,10 +572,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -588,7 +588,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -599,7 +599,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -616,11 +616,11 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -634,10 +634,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -651,10 +651,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -667,7 +667,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -678,7 +678,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -710,7 +710,7 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,4294967295,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -719,7 +719,7 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -769,7 +769,7 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -782,7 +782,7 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -822,10 +822,10 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -838,9 +838,9 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -854,9 +854,9 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -869,7 +869,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -880,7 +880,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -899,10 +899,10 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -914,7 +914,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -928,8 +928,8 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,2,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -940,7 +940,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -951,7 +951,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -968,12 +968,12 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,3067833783,3067833783,3067833783]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -987,10 +987,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1004,10 +1004,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,2147483648,2147483648,2147483648]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,2147483648,2,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1020,7 +1020,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1031,7 +1031,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,3067833783,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1048,11 +1048,11 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3067833783,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1066,10 +1066,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2147483648,2,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1083,10 +1083,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2147483648,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2147483648,2,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1099,7 +1099,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1110,7 +1110,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3067833783,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1129,10 +1129,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1144,7 +1144,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1158,8 +1158,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1170,7 +1170,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1181,7 +1181,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1198,11 +1198,11 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1216,10 +1216,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1233,10 +1233,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,268435456,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1249,7 +1249,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1260,7 +1260,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1277,11 +1277,11 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1295,10 +1295,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,1073741824,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,268435456,1073741824]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1312,10 +1312,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,1073741824,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,1,268435456,1073741824]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1328,7 +1328,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1339,7 +1339,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1358,9 +1358,9 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3435973837,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1370,7 +1370,7 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,858993459]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -1379,7 +1379,7 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_allones_and_one:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3435973837]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -1395,11 +1395,11 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1413,10 +1413,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,1,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1430,10 +1430,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,1,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1446,7 +1446,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1457,7 +1457,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,0,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1474,10 +1474,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1490,9 +1490,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,1,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1506,9 +1506,9 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,1,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1521,7 +1521,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1532,7 +1532,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1551,10 +1551,10 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3435973837,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1567,9 +1567,9 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3435973837]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1,1]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1583,9 +1583,9 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1,1]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1598,7 +1598,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1609,7 +1609,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3435973837]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1626,11 +1626,11 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3067833783,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1644,10 +1644,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,1,0,3067833783]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,u,2147483648,u]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1661,10 +1661,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,u,2147483648,u]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2147483648,268435456,1,2147483648]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1677,7 +1677,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1688,7 +1688,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,1,0,3067833783]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1705,10 +1705,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,u,3264175145,u]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -1721,9 +1721,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,1,0,3264175145]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1737,9 +1737,9 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [268435456,268435456,1073741824,1073741824]
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1752,7 +1752,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1763,7 +1763,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,1,0,3264175145]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1781,10 +1781,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1796,7 +1796,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,4294967295,1,0]
; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1810,8 +1810,8 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1822,7 +1822,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1833,7 +1833,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
;
; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,4294967295,1,0]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1849,10 +1849,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4294967295,0,0,0]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2147483648,u,268435456,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psrlq $32, %xmm0
@@ -1864,7 +1864,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3067833783,4294967295,1,0]
; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456]
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
@@ -1878,8 +1878,8 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2147483648,u,268435456,u]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1890,7 +1890,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1901,7 +1901,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
;
; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3067833783,4294967295,1,0]
; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
index 6a36cd2..8042103 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll
@@ -25,7 +25,7 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_3:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -34,7 +34,7 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_3:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
@@ -80,7 +80,7 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_5:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -89,7 +89,7 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_5:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3435973837,3435973837,3435973837,3435973837]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
@@ -140,7 +140,7 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_6_part0:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -153,7 +153,7 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_6_part0:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -211,7 +211,7 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_6_part1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $1, %xmm1
; CHECK-SSE41-NEXT: pslld $31, %xmm0
@@ -224,7 +224,7 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_6_part1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -275,7 +275,7 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: t32_tautological:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -286,7 +286,7 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: t32_tautological:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 2166e43..b490c3c 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -23,7 +23,7 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -32,7 +32,7 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_odd_25:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -83,7 +83,7 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
; CHECK-SSE41-NEXT: pslld $30, %xmm0
@@ -96,7 +96,7 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -139,9 +139,9 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_neg25:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1030792151,u,3264175145,u]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -151,7 +151,7 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_odd_neg25:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,1,1,171798691]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -160,7 +160,7 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
;
; CHECK-AVX-LABEL: test_urem_odd_neg25:
; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [3264175145,1030792151,1030792151,3264175145]
; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -176,9 +176,9 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_neg100:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3264175145,3264175145,3264175145,3264175145]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
@@ -192,7 +192,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: test_urem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE41-NEXT: psrld $2, %xmm1
; CHECK-SSE41-NEXT: pslld $30, %xmm0
@@ -205,7 +205,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: test_urem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -216,7 +216,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX2-LABEL: test_urem_even_neg100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -227,7 +227,7 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
;
; CHECK-AVX512VL-LABEL: test_urem_even_neg100:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4252017623,3264175145,4252017623,3264175145]
; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -277,7 +277,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrld $3, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [25,25,25,25]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -293,7 +293,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25,25,25,25]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -372,7 +372,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [100,100,100,100]
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -388,7 +388,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [100,100,100,100]
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
index 84856aa..e5b19a5 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -25,7 +25,7 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t1_all_odd_eq:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -33,7 +33,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: t1_all_odd_eq:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -43,7 +43,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: t1_all_odd_eq:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -76,7 +76,7 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind {
define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: t1_all_odd_ne:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -84,7 +84,7 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
;
; CHECK-SSE41-LABEL: t1_all_odd_ne:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
@@ -95,7 +95,7 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind {
;
; CHECK-AVX1-LABEL: t1_all_odd_ne:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311531,2863311531,2863311531,2863311531]
; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -187,7 +187,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE2-NEXT: psrlq $32, %xmm3
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311530,2863311530]
; CHECK-SSE2-NEXT: paddq %xmm3, %xmm0
; CHECK-SSE2-NEXT: psllq $32, %xmm0
; CHECK-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -212,7 +212,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE41-NEXT: psrlq $32, %xmm3
; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2863311530,2863311530]
; CHECK-SSE41-NEXT: paddq %xmm3, %xmm0
; CHECK-SSE41-NEXT: psllq $32, %xmm0
; CHECK-SSE41-NEXT: paddq %xmm2, %xmm0
@@ -236,7 +236,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311530,2863311530]
; CHECK-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -255,7 +255,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2863311530,2863311530]
; CHECK-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 6174011..fce8795 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
@@ -241,7 +241,7 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -319,7 +319,7 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
@@ -598,6 +598,33 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: var_shuffle_zero_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512F-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
@@ -921,6 +948,28 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: var_shuffle_zero_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
@@ -1212,7 +1261,7 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1290,7 +1339,7 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; SSE41-NEXT: pmaxud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [67372036,67372036,67372036,67372036]
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_reassociate.ll b/llvm/test/CodeGen/X86/vec_reassociate.ll
index a9473fff..4703ca3 100644
--- a/llvm/test/CodeGen/X86/vec_reassociate.ll
+++ b/llvm/test/CodeGen/X86/vec_reassociate.ll
@@ -38,13 +38,13 @@ define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32:
; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
-; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,6,6,4]
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32:
; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
-; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,6,4]
; X64-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
%2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
@@ -56,13 +56,13 @@ define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32_commute:
; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
-; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,6,6,4]
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32_commute:
; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
-; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,6,6,4]
; X64-NEXT: retq
%1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
%2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 762900e..a0c2760 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1821,9 +1821,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1841,7 +1841,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1854,7 +1854,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -1935,9 +1935,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 445e572..2fadf5f 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1647,7 +1647,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,512,1024,2048]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm4
@@ -1656,7 +1656,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index d0690bd..ec2efcd 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1302,9 +1302,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1316,8 +1316,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1328,8 +1328,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1394,9 +1394,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 421fa98..5f7e407 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -1082,13 +1082,13 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index b378dce..304daab 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index 06ff7e7..ae5dd18 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -500,9 +500,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-NEXT: psrld $27, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -514,7 +514,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE41-NEXT: psrld $27, %xmm2
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -523,7 +523,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -598,9 +598,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; X86-SSE2-NEXT: psrld $27, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 9b52857..33a6a76 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1741,9 +1741,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1761,7 +1761,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -1774,7 +1774,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -1856,9 +1856,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
@@ -1872,7 +1872,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
@@ -1964,7 +1964,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
-; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index a387562..217431be 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1403,7 +1403,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [16777216,8388608,4194304,2097152]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrld $7, %xmm1, %xmm3
; AVX1-NEXT: vpsrld $5, %xmm1, %xmm4
@@ -1412,7 +1412,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 4969cb5..5d01dfd 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1380,9 +1380,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1394,8 +1394,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,33554432,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,67108864,33554432]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1406,8 +1406,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1472,9 +1472,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,67108864,33554432]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,33554432,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index e2a3e26..4dc931d 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -1134,13 +1134,13 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,33554432,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [8388608,u,2097152,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,67108864,33554432]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16777216,8388608,4194304,2097152]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index ef5ffe4..4b42b18 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 816d5ca..e68d1d7 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -171,7 +171,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm4
@@ -193,7 +193,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -260,11 +260,11 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -287,10 +287,10 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -561,7 +561,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm4, %xmm4
@@ -588,7 +588,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -667,11 +667,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
@@ -706,11 +706,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
@@ -741,10 +741,10 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,147,0,79,0,43,0,187,0,103,0,57,0,57,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
@@ -1116,11 +1116,11 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,103,0,187,0,43,0,79,0,147,0,137,0,129,0,121]
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,129,0,86,0,129,0,103,0,43,0,147,0,129,0,57]
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1144,10 +1144,10 @@ define <16 x i8> @PR143238(<16 x i8> %a0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [26368,47872,11008,20224,37632,35072,33024,30976]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,103,0,187,0,43,0,79,0,147,0,137,0,129,0,121]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [33024,22016,33024,26368,11008,37632,33024,14592]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,129,0,86,0,129,0,103,0,43,0,147,0,129,0,57]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 63c69e5..7355f36 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -161,7 +161,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -198,7 +198,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -245,10 +245,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
@@ -266,10 +266,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [35072,33024,30976,14592,6912,26368,12544,47872]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
@@ -291,10 +291,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -539,7 +539,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX1-NEXT: vpmulhw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -585,7 +585,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
@@ -640,10 +640,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
@@ -668,10 +668,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [35072,33024,30976,14592,6912,26368,12544,47872]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
@@ -699,10 +699,10 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [35072,33024,30976,14592,6912,26368,12544,47872,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37632,33024,14592,26368,47872,11008,20224,37632,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX2NOBW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 6bc4fcb..5445330 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -132,7 +132,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -199,10 +199,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
@@ -220,10 +220,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm4
; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -245,10 +245,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912,35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27,0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072,6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137,0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -444,7 +444,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -490,7 +490,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147,0,147]
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
@@ -524,10 +524,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4
@@ -552,10 +552,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072]
+; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpaddb %ymm4, %ymm3, %ymm3
@@ -583,10 +583,10 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912,35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,137,0,129,0,121,0,57,0,27,0,103,0,49,0,187,0,133,0,129,0,125,0,121,0,59,0,57,0,111,0,27,0,137,0,71,0,147,0,19,0,79,0,41,0,43,0,179,0,147,0,79,0,43,0,187,0,103,0,57,0,129,0,147]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072,6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072]
+; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,147,0,129,0,57,0,103,0,187,0,43,0,79,0,147,0,179,0,43,0,41,0,79,0,19,0,147,0,71,0,137,0,27,0,111,0,57,0,59,0,121,0,125,0,129,0,133,0,187,0,49,0,103,0,27,0,57,0,121,0,129,0,137]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 33d80f6..6cd5098 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -169,7 +169,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -209,7 +209,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -270,22 +270,22 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psubb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
@@ -309,7 +309,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -317,15 +317,15 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: psubb %xmm4, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: paddb %xmm4, %xmm2
@@ -346,22 +346,22 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -638,7 +638,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -690,7 +690,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -763,23 +763,23 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm3, %xmm2
@@ -809,7 +809,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -817,16 +817,16 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: psllw $7, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubb %xmm4, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
@@ -854,22 +854,22 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,0,79,0,171,0,117,0,205,0,57,0,57,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index e43108f..98ea87c 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -166,7 +166,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -200,7 +200,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -246,22 +246,22 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -276,22 +276,22 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
@@ -312,20 +312,20 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -578,7 +578,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -622,7 +622,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -676,22 +676,22 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
@@ -713,22 +713,22 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
@@ -755,20 +755,20 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm3
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index bf98bcc..a11fa370 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -135,7 +135,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
@@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -199,20 +199,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
@@ -226,20 +226,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -259,20 +259,20 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0,137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0,27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
@@ -473,7 +473,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
@@ -517,7 +517,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -551,20 +551,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
@@ -585,20 +585,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
@@ -624,20 +624,20 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,0,16,0,241,0,57,0,27,0,205,0,135,0,187,0,9,0,8,0,249,0,241,0,235,0,57,0,111,0,27,0,137,0,27,0,37,0,19,0,79,0,41,0,171,0,101,0,147,0,79,0,171,0,117,0,205,0,57,0,32,0,37,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,0,32,0,57,0,205,0,117,0,171,0,79,0,147,0,101,0,171,0,41,0,79,0,19,0,37,0,27,0,137,0,27,0,111,0,57,0,235,0,241,0,249,0,8,0,9,0,187,0,135,0,205,0,27,0,57,0,241,0,16,0,137,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm1[8],zmm3[9],zmm1[9],zmm3[10],zmm1[10],zmm3[11],zmm1[11],zmm3[12],zmm1[12],zmm3[13],zmm1[13],zmm3[14],zmm1[14],zmm3[15],zmm1[15],zmm3[24],zmm1[24],zmm3[25],zmm1[25],zmm3[26],zmm1[26],zmm3[27],zmm1[27],zmm3[28],zmm1[28],zmm3[29],zmm1[29],zmm3[30],zmm1[30],zmm3[31],zmm1[31],zmm3[40],zmm1[40],zmm3[41],zmm1[41],zmm3[42],zmm1[42],zmm3[43],zmm1[43],zmm3[44],zmm1[44],zmm3[45],zmm1[45],zmm3[46],zmm1[46],zmm3[47],zmm1[47],zmm3[56],zmm1[56],zmm3[57],zmm1[57],zmm3[58],zmm1[58],zmm3[59],zmm1[59],zmm3[60],zmm1[60],zmm3[61],zmm1[61],zmm3[62],zmm1[62],zmm3[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm1[0],zmm3[1],zmm1[1],zmm3[2],zmm1[2],zmm3[3],zmm1[3],zmm3[4],zmm1[4],zmm3[5],zmm1[5],zmm3[6],zmm1[6],zmm3[7],zmm1[7],zmm3[16],zmm1[16],zmm3[17],zmm1[17],zmm3[18],zmm1[18],zmm3[19],zmm1[19],zmm3[20],zmm1[20],zmm3[21],zmm1[21],zmm3[22],zmm1[22],zmm3[23],zmm1[23],zmm3[32],zmm1[32],zmm3[33],zmm1[33],zmm3[34],zmm1[34],zmm3[35],zmm1[35],zmm3[36],zmm1[36],zmm3[37],zmm1[37],zmm3[38],zmm1[38],zmm3[39],zmm1[39],zmm3[48],zmm1[48],zmm3[49],zmm1[49],zmm3[50],zmm1[50],zmm3[51],zmm1[51],zmm3[52],zmm1[52],zmm3[53],zmm1[53],zmm3[54],zmm1[54],zmm3[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm2, %zmm3, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 6e1bf25..d0bb90c 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -130,31 +130,31 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,u,8,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,u,8,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
@@ -190,12 +190,12 @@ define <4 x i32> @mul_v4i32_1_2_4_8_optsize(<4 x i32> %a0) nounwind optsize {
;
; X86-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8]
; X86-SSE4-NEXT: retl
;
; X64-SSE4-LABEL: mul_v4i32_1_2_4_8_optsize:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8]
; X64-SSE4-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_1_2_4_8_optsize:
@@ -989,7 +989,7 @@ define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_17_65:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [17,65]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 17, i64 65>
ret <2 x i64> %1
@@ -999,36 +999,36 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [5,17,33,65]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [17,u,65,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [5,17,33,65]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [5,17,33,65]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [17,u,65,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_5_17_33_65:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [5,17,33,65]
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [5,17,33,65]
; X64-AVX-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
ret <4 x i32> %1
@@ -1384,7 +1384,7 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [15,63]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 15, i64 63>
ret <2 x i64> %1
@@ -1427,7 +1427,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1441,7 +1441,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1453,7 +1453,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1465,7 +1465,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1473,7 +1473,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18446744073709551601,18446744073709551553]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
ret <2 x i64> %1
@@ -1516,7 +1516,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1530,7 +1530,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4294967295,4294967295]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1542,7 +1542,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1554,7 +1554,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4294967295,4294967295]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1562,7 +1562,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [18446744073709551599,18446744073709551551]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
ret <2 x i64> %1
@@ -1600,7 +1600,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1614,7 +1614,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1628,7 +1628,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1642,7 +1642,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1654,7 +1654,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1666,7 +1666,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1674,7 +1674,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 0, i64 -1>
ret <2 x i64> %1
@@ -1689,7 +1689,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psrlq $32, %xmm3
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE2-NEXT: paddq %xmm3, %xmm0
; X86-SSE2-NEXT: psllq $32, %xmm0
; X86-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1703,7 +1703,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86-SSE4-NEXT: movdqa %xmm0, %xmm3
; X86-SSE4-NEXT: psrlq $32, %xmm3
; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,0,u,u,u,u,255,255,255,255,u,u,u,u]
; X86-SSE4-NEXT: paddq %xmm3, %xmm0
; X86-SSE4-NEXT: psllq $32, %xmm0
; X86-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1717,7 +1717,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
; X64-SSE2-NEXT: psrlq $32, %xmm3
; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE2-NEXT: paddq %xmm3, %xmm0
; X64-SSE2-NEXT: psllq $32, %xmm0
; X64-SSE2-NEXT: paddq %xmm2, %xmm0
@@ -1731,7 +1731,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-SSE4-NEXT: movdqa %xmm0, %xmm3
; X64-SSE4-NEXT: psrlq $32, %xmm3
; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3
-; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-SSE4-NEXT: paddq %xmm3, %xmm0
; X64-SSE4-NEXT: psllq $32, %xmm0
; X64-SSE4-NEXT: paddq %xmm2, %xmm0
@@ -1743,7 +1743,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1755,7 +1755,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,0,255,255,255,255,0,0,0,0]
; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
@@ -1763,7 +1763,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [15,18446744073709551553]
; X64-AVX512DQ-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 15, i64 -63>
ret <2 x i64> %1
@@ -1773,36 +1773,36 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
; X86-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,15,31,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [15,u,7,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT: retl
;
; X86-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,15,31,7]
; X86-SSE4-NEXT: retl
;
; X64-SSE2-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,15,31,7]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [15,u,7,u]
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE2-NEXT: retq
;
; X64-SSE4-LABEL: mul_v4i32_0_15_31_7:
; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,15,31,7]
; X64-SSE4-NEXT: retq
;
; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,15,31,7]
; X64-AVX-NEXT: retq
%1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
ret <4 x i32> %1
@@ -1947,7 +1947,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_68_132:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [68,132]
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 68, i64 132>
ret <2 x i64> %mul
@@ -2009,7 +2009,7 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v2i64_60_120:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [60,124]
; X64-AVX512DQ-NEXT: retq
%mul = mul <2 x i64> %x, <i64 60, i64 124>
ret <2 x i64> %mul
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 983ae59..3d85d55 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -851,7 +851,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddw %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 93f4ce7..0bf5a8d 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1092,9 +1092,9 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_rotate_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1106,8 +1106,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE41-LABEL: constant_rotate_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1118,8 +1118,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -1156,9 +1156,9 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; X86-SSE2-LABEL: constant_rotate_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 64c3118..5ae3e2f 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -895,13 +895,13 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,128,u]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [512,u,2048,u]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,512,1024,2048]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index d565ef0..1602cde 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1673,7 +1673,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
@@ -1750,7 +1750,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 8cb2c7b..a847da6 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1223,7 +1223,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1275,7 +1275,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 57874c4..eb39b6a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1480,7 +1480,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1532,7 +1532,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE-NEXT: pandn %xmm0, %xmm1
-; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
; X86-SSE-NEXT: por %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 99dac74..3085c32 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -987,21 +987,21 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,128,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,64,128]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,64,128]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
@@ -1032,9 +1032,9 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,64,128]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,128,u]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index b56a8b5..f9ccd1e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1117,9 +1117,9 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [16,32,64,128]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,512,256,128]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1153,9 +1153,9 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
;
; X86-AVX1-LABEL: constant_shift_v8i32:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [16,32,64,128]
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,512,256,128]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 0e20b18..18d79b6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -778,9 +778,9 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
; SSSE3-LABEL: combine_shl_pshufb:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,256,65536,65536]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,u,65536,u]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
@@ -788,13 +788,13 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
;
; SSE41-LABEL: combine_shl_pshufb:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,256,65536,65536]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_shl_pshufb:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,256,65536,65536]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 1af7542..4235377 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2110,7 +2110,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -2119,7 +2119,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -2127,7 +2127,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -2135,7 +2135,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
@@ -2143,7 +2143,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -2151,7 +2151,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,2,3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
@@ -2253,13 +2253,13 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,3]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [4,5]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [6,7]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [8,9]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [10,11]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [12,13]
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [14,15]
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -2280,18 +2280,18 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 # [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 # [4,5]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [6,7]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 # [8,9]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [10,11]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 # [12,13]
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [14,15]
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm8 = [255,255]
; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
@@ -2313,10 +2313,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [4,5,6,7]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [8,9,10,11]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [12,13,14,15]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
@@ -2335,8 +2335,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2345,8 +2345,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2355,8 +2355,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2371,27 +2371,27 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,u,3,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [5,u,7,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [8,9,10,11]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [9,u,11,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [12,13,14,15]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [13,u,15,u]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
@@ -2406,12 +2406,12 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,5,6,7]
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [8,9,10,11]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [12,13,14,15]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
@@ -2425,8 +2425,8 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -2439,7 +2439,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index 17315c4..1c5be03 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -95,7 +95,7 @@ bb:
define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## [2863311531,2863311531,2863311531,2863311531]
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index 8543e9f..16700d4 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1046,7 +1046,7 @@ define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
define <4 x i32> @blend_mask_cond_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; AVX1-LABEL: blend_mask_cond_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,4194304,1073741824,2147483648]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -1211,9 +1211,9 @@ define <4 x i64> @blend_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z
define <8 x i32> @blend_mask_cond_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: blend_mask_cond_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [2147483648,1073741824,268435456,536870912]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,2097152,1073741824,524288]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index b2064b1..02d4d88 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -181,40 +181,38 @@ define zeroext i1 @segmentedStack(ptr readonly %vk1, ptr readonly %vk2, i64 %key
; CHECK-LABEL: segmentedStack:
; CHECK: ## %bb.0:
; CHECK-NEXT: cmpq %gs:816, %rsp
-; CHECK-NEXT: jbe LBB3_6
+; CHECK-NEXT: jbe LBB3_7
; CHECK-NEXT: LBB3_1: ## %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: testq %rsi, %rsi
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: orb %al, %cl
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: sete %al
-; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: jne LBB3_4
-; CHECK-NEXT: ## %bb.2: ## %if.end4.i
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: je LBB3_5
+; CHECK-NEXT: ## %bb.2: ## %entry
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: je LBB3_5
+; CHECK-NEXT: ## %bb.3: ## %if.end4.i
; CHECK-NEXT: movq 8(%rdi), %rdx
; CHECK-NEXT: cmpq 8(%rsi), %rdx
-; CHECK-NEXT: jne LBB3_5
-; CHECK-NEXT: ## %bb.3: ## %land.rhs.i.i
+; CHECK-NEXT: jne LBB3_6
+; CHECK-NEXT: ## %bb.4: ## %land.rhs.i.i
; CHECK-NEXT: movq (%rsi), %rsi
; CHECK-NEXT: movq (%rdi), %rdi
; CHECK-NEXT: callq _memcmp
; CHECK-NEXT: testl %eax, %eax
; CHECK-NEXT: sete %al
-; CHECK-NEXT: LBB3_4: ## %__go_ptr_strings_equal.exit
+; CHECK-NEXT: LBB3_5: ## %__go_ptr_strings_equal.exit
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
-; CHECK-NEXT: LBB3_5:
+; CHECK-NEXT: LBB3_6:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ## kill: def $al killed $al killed $eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
-; CHECK-NEXT: LBB3_6:
+; CHECK-NEXT: LBB3_7:
; CHECK-NEXT: movl $8, %r10d
; CHECK-NEXT: movl $0, %r11d
; CHECK-NEXT: callq ___morestack
@@ -224,43 +222,41 @@ define zeroext i1 @segmentedStack(ptr readonly %vk1, ptr readonly %vk2, i64 %key
; NOCOMPACTUNWIND-LABEL: segmentedStack:
; NOCOMPACTUNWIND: # %bb.0:
; NOCOMPACTUNWIND-NEXT: cmpq %fs:112, %rsp
-; NOCOMPACTUNWIND-NEXT: jbe .LBB3_6
+; NOCOMPACTUNWIND-NEXT: jbe .LBB3_7
; NOCOMPACTUNWIND-NEXT: .LBB3_1: # %entry
; NOCOMPACTUNWIND-NEXT: pushq %rax
; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 16
-; NOCOMPACTUNWIND-NEXT: testq %rdi, %rdi
-; NOCOMPACTUNWIND-NEXT: sete %al
-; NOCOMPACTUNWIND-NEXT: testq %rsi, %rsi
-; NOCOMPACTUNWIND-NEXT: sete %cl
-; NOCOMPACTUNWIND-NEXT: orb %al, %cl
; NOCOMPACTUNWIND-NEXT: movq %rdi, %rax
; NOCOMPACTUNWIND-NEXT: orq %rsi, %rax
; NOCOMPACTUNWIND-NEXT: sete %al
-; NOCOMPACTUNWIND-NEXT: testb %cl, %cl
-; NOCOMPACTUNWIND-NEXT: jne .LBB3_4
-; NOCOMPACTUNWIND-NEXT: # %bb.2: # %if.end4.i
+; NOCOMPACTUNWIND-NEXT: testq %rdi, %rdi
+; NOCOMPACTUNWIND-NEXT: je .LBB3_5
+; NOCOMPACTUNWIND-NEXT: # %bb.2: # %entry
+; NOCOMPACTUNWIND-NEXT: testq %rsi, %rsi
+; NOCOMPACTUNWIND-NEXT: je .LBB3_5
+; NOCOMPACTUNWIND-NEXT: # %bb.3: # %if.end4.i
; NOCOMPACTUNWIND-NEXT: movq 8(%rdi), %rdx
; NOCOMPACTUNWIND-NEXT: cmpq 8(%rsi), %rdx
-; NOCOMPACTUNWIND-NEXT: jne .LBB3_5
-; NOCOMPACTUNWIND-NEXT: # %bb.3: # %land.rhs.i.i
+; NOCOMPACTUNWIND-NEXT: jne .LBB3_6
+; NOCOMPACTUNWIND-NEXT: # %bb.4: # %land.rhs.i.i
; NOCOMPACTUNWIND-NEXT: movq (%rsi), %rsi
; NOCOMPACTUNWIND-NEXT: movq (%rdi), %rdi
; NOCOMPACTUNWIND-NEXT: callq memcmp@PLT
; NOCOMPACTUNWIND-NEXT: testl %eax, %eax
; NOCOMPACTUNWIND-NEXT: sete %al
-; NOCOMPACTUNWIND-NEXT: .LBB3_4: # %__go_ptr_strings_equal.exit
+; NOCOMPACTUNWIND-NEXT: .LBB3_5: # %__go_ptr_strings_equal.exit
; NOCOMPACTUNWIND-NEXT: # kill: def $al killed $al killed $eax
; NOCOMPACTUNWIND-NEXT: popq %rcx
; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 8
; NOCOMPACTUNWIND-NEXT: retq
-; NOCOMPACTUNWIND-NEXT: .LBB3_5:
+; NOCOMPACTUNWIND-NEXT: .LBB3_6:
; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 16
; NOCOMPACTUNWIND-NEXT: xorl %eax, %eax
; NOCOMPACTUNWIND-NEXT: # kill: def $al killed $al killed $eax
; NOCOMPACTUNWIND-NEXT: popq %rcx
; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 8
; NOCOMPACTUNWIND-NEXT: retq
-; NOCOMPACTUNWIND-NEXT: .LBB3_6:
+; NOCOMPACTUNWIND-NEXT: .LBB3_7:
; NOCOMPACTUNWIND-NEXT: movl $8, %r10d
; NOCOMPACTUNWIND-NEXT: movl $0, %r11d
; NOCOMPACTUNWIND-NEXT: callq __morestack
diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
new file mode 100644
index 0000000..d9253e0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/zero-call-used-regs-simd.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl,+avx512bw -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512,AVX512BW
+
+define void @zero_xmm(<4 x i32> %arg) #0 {
+; SSE-LABEL: zero_xmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_xmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_xmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ store <4 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_ymm(<8 x i32> %arg) #0 {
+; SSE-LABEL: zero_ymm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_ymm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %ymm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <8 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_zmm(<16 x i32> %arg) #0 {
+; SSE-LABEL: zero_zmm:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm3, 48
+; SSE-NEXT: movaps %xmm2, 32
+; SSE-NEXT: movaps %xmm1, 16
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: zero_zmm:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %ymm1, 32
+; AVX-NEXT: vmovaps %ymm0, 0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: zero_zmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups %zmm0, 0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ store <16 x i32> %arg, ptr null, align 32
+ ret void
+}
+
+define void @zero_k(<8 x i32> %arg, <8 x i1> %mask) #0 {
+; SSE-LABEL: zero_k:
+; SSE: # %bb.0:
+; SSE-NEXT: psllw $15, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: testb $1, %al
+; SSE-NEXT: jne .LBB3_1
+; SSE-NEXT: # %bb.2: # %else
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: jne .LBB3_3
+; SSE-NEXT: .LBB3_4: # %else2
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: jne .LBB3_5
+; SSE-NEXT: .LBB3_6: # %else4
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: jne .LBB3_7
+; SSE-NEXT: .LBB3_8: # %else6
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: jne .LBB3_9
+; SSE-NEXT: .LBB3_10: # %else8
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: jne .LBB3_11
+; SSE-NEXT: .LBB3_12: # %else10
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: jne .LBB3_13
+; SSE-NEXT: .LBB3_14: # %else12
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: je .LBB3_16
+; SSE-NEXT: .LBB3_15: # %cond.store13
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 28
+; SSE-NEXT: .LBB3_16: # %else14
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB3_1: # %cond.store
+; SSE-NEXT: movd %xmm0, 0
+; SSE-NEXT: testb $2, %al
+; SSE-NEXT: je .LBB3_4
+; SSE-NEXT: .LBB3_3: # %cond.store1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: movd %xmm2, 4
+; SSE-NEXT: testb $4, %al
+; SSE-NEXT: je .LBB3_6
+; SSE-NEXT: .LBB3_5: # %cond.store3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: movd %xmm2, 8
+; SSE-NEXT: testb $8, %al
+; SSE-NEXT: je .LBB3_8
+; SSE-NEXT: .LBB3_7: # %cond.store5
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-NEXT: movd %xmm0, 12
+; SSE-NEXT: testb $16, %al
+; SSE-NEXT: je .LBB3_10
+; SSE-NEXT: .LBB3_9: # %cond.store7
+; SSE-NEXT: movd %xmm1, 16
+; SSE-NEXT: testb $32, %al
+; SSE-NEXT: je .LBB3_12
+; SSE-NEXT: .LBB3_11: # %cond.store9
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movd %xmm0, 20
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: je .LBB3_14
+; SSE-NEXT: .LBB3_13: # %cond.store11
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movd %xmm0, 24
+; SSE-NEXT: testb $-128, %al
+; SSE-NEXT: jne .LBB3_15
+; SSE-NEXT: jmp .LBB3_16
+;
+; AVX1-LABEL: zero_k:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zero_k:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, 0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: zero_k:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: kxorw %k0, %k0, %k1
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: zero_k:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovw2m %xmm1, %k1
+; AVX512BW-NEXT: vmovdqa32 %ymm0, 0 {%k1}
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: kxorq %k0, %k0, %k1
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+ tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr null, i32 32, <8 x i1> %mask)
+ ret void
+}
+
+attributes #0 = { "zero-call-used-regs"="used" }
diff --git a/llvm/test/DebugInfo/AArch64/callsite.mir b/llvm/test/DebugInfo/AArch64/callsite.mir
new file mode 100644
index 0000000..e3bd764
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/callsite.mir
@@ -0,0 +1,68 @@
+# This test should not crash when generating call-site information.
+# It was created to make sure that if isCopyLikeInstr in TargetInstrInfo.h
+# returns an undef Dest Reg or Src Reg, we don't try to get a SubReg for it.
+
+# RUN: llc --mtriple=arm64e-apple-ios -start-before=aarch64-asm-printer %s -filetype=obj -o /dev/null --emit-call-site-info
+--- |
+ %struct.rtyuio = type { i8 }
+ define noundef i32 @aserty(ptr noundef %0, ptr noundef %1) local_unnamed_addr #0 !dbg !23 {
+ ret i32 0
+ }
+ define void @asdfgh(ptr noundef %0, ptr noundef %1, i8 noundef zeroext %2) local_unnamed_addr #0 !dbg !53 {
+ %4 = alloca ptr
+ %5 = call ptr @llvm.stackguard()
+ %6 = alloca %struct.rtyuio
+ %7 = icmp eq ptr %1, null
+ br i1 %7, label %10, label %8
+ %9 = tail call i8 @polkiokl(ptr noundef %0) #6
+ br label %10
+ ret void
+ }
+ declare i8 @polkiokl(ptr noundef) local_unnamed_addr #2
+ !llvm.module.flags = !{!2, !8}
+ !llvm.dbg.cu = !{!9}
+ !2 = !{i32 2, !"Debug Info Version", i32 3}
+ !8 = !{i32 7, !"frame-pointer", i32 1}
+ !9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !10, emissionKind: FullDebug, sysroot: "/")
+ !10 = !DIFile(filename: "a.cpp", directory: "/")
+ !23 = distinct !DISubprogram(type: !27, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, unit: !9, retainedNodes: !46)
+ !24 = distinct !DICompositeType(tag: DW_TAG_class_type, identifier: "yshscbshhdvcm")
+ !27 = !DISubroutineType(types: !28)
+ !28 = !{}
+ !30 = !DIDerivedType(tag: DW_TAG_typedef, baseType: !33)
+ !33 = distinct !DICompositeType(tag: DW_TAG_structure_type, identifier: "tyruwyeuiwiybabd")
+ !36 = !DISubroutineType(types: !37)
+ !37 = !{}
+ !46 = !{}
+ !47 = !DILocalVariable(scope: !23, type: !48, flags: DIFlagArtificial | DIFlagObjectPointer)
+ !48 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !24, size: 64)
+ !49 = !DILocalVariable(scope: !23, type: !30)
+ !50 = !DILocation(scope: !23)
+ !51 = !DILocation(scope: !23)
+ !53 = distinct !DISubprogram(type: !36, unit: !9, retainedNodes: !54)
+ !54 = !{}
+name: aserty
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:
+ - { bb: 0, offset: 9, fwdArgRegs:
+ - { arg: 2, reg: '$w2' } }
+body: |
+ bb.0 (%ir-block.2):
+ DBG_VALUE $x0, $noreg, !47, !DIExpression(), debug-location !50
+ DBG_VALUE $x1, $noreg, !49, !DIExpression(), debug-location !50
+ frame-setup PACIBSP implicit-def $lr, implicit killed $lr, implicit $sp
+ early-clobber $sp = frame-setup STPXpre $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.1), (store (s64) into %stack.0)
+ $fp = frame-setup ADDXri $sp, 0, 0
+ frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+ frame-setup CFI_INSTRUCTION offset $w30, -8
+ frame-setup CFI_INSTRUCTION offset $w29, -16
+ $x2 = ORRXrs $xzr, undef $noreg, 0, implicit $wzr, debug-location !51
+ BL @asdfgh, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit killed $w2, implicit-def $sp, debug-location !51
+...
+name: asdfgh
+body: |
+ bb.2 (%ir-block.10):
diff --git a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
index 446a84d..ffdc80a 100644
--- a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
+++ b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
@@ -101,7 +101,7 @@
; CHECK-NEXT: LocalVariableAddrRange {
; CHECK-NEXT: OffsetStart: .text+0x0
; CHECK-NEXT: ISectStart: 0x0
-; CHECK-NEXT: Range: 0xBC
+; CHECK-NEXT: Range: 0xB8
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
diff --git a/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir b/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir
new file mode 100644
index 0000000..b97e916
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/shrink-wrap-frame-setup-no-loc.mir
@@ -0,0 +1,99 @@
+# RUN: %llc_dwarf %s -o - -mtriple=x86_64-unknown-unknown --start-after=livedebugvalues | FileCheck %s
+
+## Check the line number from the ret above `.LBB0_2` doesn't leak onto the
+## frame setup instructions in the `.LBB0_2` block; `pushq %rax` should
+## explicitly get set to line zero.
+
+# CHECK: loop:
+# CHECK-NEXT: .Lfunc_begin0:
+# CHECK-NEXT: .cfi_startproc
+# CHECK-NEXT: # %bb.0:
+# CHECK-NEXT: .file 1 "/" "test.c"
+# CHECK-NEXT: .loc 1 5 16 prologue_end # test.c:5:16
+# CHECK-NEXT: testq %rax, %rax
+# CHECK-NEXT: je .LBB0_2
+# CHECK-NEXT: # %bb.1:
+# CHECK-NEXT: .loc 1 5 16 # test.c:5:16
+# CHECK-NEXT: retq
+# CHECK-NEXT: .LBB0_2:
+# -- Check the .loc below sets the current location to line 0.
+# CHECK-NEXT: .loc 1 0 16 is_stmt 0 # test.c:0:16
+# CHECK-NEXT: pushq %rax
+# CHECK-NEXT: .cfi_def_cfa_offset 16
+# CHECK-NEXT: addq $8, %rsp
+# CHECK-NEXT: .cfi_def_cfa_offset 8
+# CHECK-NEXT: .loc 1 5 16 is_stmt 1 # test.c:5:16
+# CHECK-NEXT: retq
+
+--- |
+ source_filename = "reduced.ll"
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-unknown"
+
+ define void @loop(i64 %i) !dbg !4 {
+ entry:
+ %cmp.not = icmp eq i64 %i, 0, !dbg !7
+ br i1 %cmp.not, label %for.body, label %for.end
+
+ for.body: ; preds = %entry
+ %puts10 = tail call i32 null(ptr null)
+ %inc = add i64 0, 0
+ br label %for.end
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!3}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 22.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+ !1 = !DIFile(filename: "test.c", directory: "/")
+ !2 = !{}
+ !3 = !{i32 2, !"Debug Info Version", i32 3}
+ !4 = distinct !DISubprogram(name: "loop", scope: !1, file: !1, line: 4, type: !5, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2, keyInstructions: true)
+ !5 = !DISubroutineType(types: !6)
+ !6 = !{null}
+ !7 = !DILocation(line: 5, column: 16, scope: !8, atomGroup: 720, atomRank: 2)
+ !8 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 9)
+...
+---
+name: loop
+alignment: 16
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+debugInstrRef: true
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$rdi' }
+frameInfo:
+ stackSize: 8
+ offsetAdjustment: -8
+ maxAlignment: 1
+ adjustsStack: true
+ hasCalls: true
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ amxProgModel: None
+body: |
+ bb.0:
+ successors: %bb.1(0x30000000), %bb.2(0x50000000)
+ liveins: $rdi
+
+ TEST64rr undef renamable $rax, undef renamable $rax, implicit-def $eflags, debug-location !7
+ JCC_1 %bb.1, 4, implicit $eflags
+
+ bb.2:
+ RET64 debug-location !7
+
+ bb.1:
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ $rsp = frame-destroy ADD64ri32 $rsp, 8, implicit-def dead $eflags
+ frame-destroy CFI_INSTRUCTION def_cfa_offset 8
+ RET64 debug-location !7
+...
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
index ae8b2b3..2a7216f 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll
@@ -23,7 +23,7 @@ declare i32 @dummyPersonality(...)
define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr @__CxxFrameHandler3 {
; CHECK-INLINE-LABEL: define void @FuncletPersonality(
-; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 {
+; CHECK-INLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR2:[0-9]+]] personality ptr @__CxxFrameHandler3 {
; CHECK-INLINE-NEXT: entry:
; CHECK-INLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32
; CHECK-INLINE-NEXT: store i64 0, ptr [[TMP0]], align 8
@@ -37,33 +37,26 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: br label [[TMP6]]
; CHECK-INLINE: 6:
; CHECK-INLINE-NEXT: [[TMP7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[TMP4]] ]
+; CHECK-INLINE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
; CHECK-INLINE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; CHECK-INLINE: 9:
+; CHECK-INLINE: 10:
; CHECK-INLINE-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 8544, align 32
-; CHECK-INLINE-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-INLINE-NEXT: br label [[TMP11]]
; CHECK-INLINE: 11:
-; CHECK-INLINE-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP7]], [[TMP6]] ], [ [[TMP10]], [[TMP9]] ]
-; CHECK-INLINE-NEXT: store i64 [[TMP12]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
-; CHECK-INLINE-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 32
-; CHECK-INLINE-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-INLINE-NEXT: [[TMP15:%.*]] = add i64 [[TMP12]], 8480
-; CHECK-INLINE-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
-; CHECK-INLINE-NEXT: [[TMP17:%.*]] = add i64 [[TMP12]], 8496
-; CHECK-INLINE-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-; CHECK-INLINE-NEXT: [[TMP19:%.*]] = add i64 [[TMP12]], 8512
-; CHECK-INLINE-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-; CHECK-INLINE-NEXT: [[TMP21:%.*]] = add i64 [[TMP12]], 8528
-; CHECK-INLINE-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-INLINE-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-INLINE-NEXT: [[TMP23:%.*]] = phi ptr [ [[TMP10]], [[TMP6]] ], [ [[MYALLOCA]], [[TMP9]] ]
+; CHECK-INLINE-NEXT: store ptr [[TMP23]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
+; CHECK-INLINE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP23]], i64 32
+; CHECK-INLINE-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP23]], i64 8480
+; CHECK-INLINE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP23]], i64 8496
+; CHECK-INLINE-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP23]], i64 8512
+; CHECK-INLINE-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP23]], i64 8528
; CHECK-INLINE-NEXT: store i64 1102416563, ptr [[TMP23]], align 8
-; CHECK-INLINE-NEXT: [[TMP24:%.*]] = add i64 [[TMP12]], 8
-; CHECK-INLINE-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+; CHECK-INLINE-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP23]], i64 8
; CHECK-INLINE-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP25]], align 8
-; CHECK-INLINE-NEXT: [[TMP26:%.*]] = add i64 [[TMP12]], 16
-; CHECK-INLINE-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr
+; CHECK-INLINE-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP23]], i64 16
; CHECK-INLINE-NEXT: store i64 ptrtoint (ptr @FuncletPersonality to i64), ptr [[TMP27]], align 8
+; CHECK-INLINE-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP23]] to i64
; CHECK-INLINE-NEXT: [[TMP28:%.*]] = lshr i64 [[TMP12]], 3
; CHECK-INLINE-NEXT: [[TMP29:%.*]] = add i64 [[TMP28]], [[TMP1]]
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f1(i64 [[TMP29]], i64 4)
@@ -87,21 +80,22 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP38]], i64 1)
; CHECK-INLINE-NEXT: [[TMP39:%.*]] = add i64 [[TMP29]], 1066
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP39]], i64 1)
+; CHECK-INLINE-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP22]] to i64
; CHECK-INLINE-NEXT: [[TMP40:%.*]] = lshr i64 [[TMP21]], 3
; CHECK-INLINE-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], [[TMP1]]
; CHECK-INLINE-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
; CHECK-INLINE-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
; CHECK-INLINE-NEXT: [[TMP44:%.*]] = icmp ne i8 [[TMP43]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP44]], label [[TMP45:%.*]], label [[TMP50:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK-INLINE: 45:
+; CHECK-INLINE: 39:
; CHECK-INLINE-NEXT: [[TMP46:%.*]] = and i64 [[TMP21]], 7
; CHECK-INLINE-NEXT: [[TMP47:%.*]] = trunc i64 [[TMP46]] to i8
; CHECK-INLINE-NEXT: [[TMP48:%.*]] = icmp sge i8 [[TMP47]], [[TMP43]]
; CHECK-INLINE-NEXT: br i1 [[TMP48]], label [[TMP49:%.*]], label [[TMP50]]
-; CHECK-INLINE: 49:
+; CHECK-INLINE: 43:
; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP21]]) #[[ATTR7:[0-9]+]]
; CHECK-INLINE-NEXT: unreachable
-; CHECK-INLINE: 50:
+; CHECK-INLINE: 44:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1
; CHECK-INLINE-NEXT: [[TMP51:%.*]] = add i64 [[TMP29]], 1066
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP51]], i64 1)
@@ -125,10 +119,10 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP65:%.*]] = load i8, ptr [[TMP64]], align 1
; CHECK-INLINE-NEXT: [[TMP66:%.*]] = icmp ne i8 [[TMP65]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP66]], label [[TMP67:%.*]], label [[TMP68:%.*]]
-; CHECK-INLINE: 67:
+; CHECK-INLINE: 61:
; CHECK-INLINE-NEXT: call void @__asan_report_store8(i64 [[TMP59]]) #[[ATTR7]]
; CHECK-INLINE-NEXT: unreachable
-; CHECK-INLINE: 68:
+; CHECK-INLINE: 62:
; CHECK-INLINE-NEXT: store volatile i64 0, ptr [[TMP61]], align 8
; CHECK-INLINE-NEXT: [[TMPCOPYI64:%.*]] = load i64, ptr [[TMP61]], align 8
; CHECK-INLINE-NEXT: [[TMP69:%.*]] = and i64 [[TMPCOPYI64]], 31
@@ -150,15 +144,15 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP83:%.*]] = load i8, ptr [[TMP82]], align 1
; CHECK-INLINE-NEXT: [[TMP84:%.*]] = icmp ne i8 [[TMP83]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP90:%.*]], !prof [[PROF1]]
-; CHECK-INLINE: 85:
+; CHECK-INLINE: 79:
; CHECK-INLINE-NEXT: [[TMP86:%.*]] = and i64 [[TMP77]], 7
; CHECK-INLINE-NEXT: [[TMP87:%.*]] = trunc i64 [[TMP86]] to i8
; CHECK-INLINE-NEXT: [[TMP88:%.*]] = icmp sge i8 [[TMP87]], [[TMP83]]
; CHECK-INLINE-NEXT: br i1 [[TMP88]], label [[TMP89:%.*]], label [[TMP90]]
-; CHECK-INLINE: 89:
+; CHECK-INLINE: 83:
; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP77]]) #[[ATTR7]]
; CHECK-INLINE-NEXT: unreachable
-; CHECK-INLINE: 90:
+; CHECK-INLINE: 84:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP79]], align 1
; CHECK-INLINE-NEXT: invoke void @MayThrowFunc()
; CHECK-INLINE-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[EHCLEANUP:%.*]]
@@ -170,15 +164,15 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: store i64 1172321806, ptr [[TMP23]], align 8
; CHECK-INLINE-NEXT: [[TMP93:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP93]], label [[TMP94:%.*]], label [[TMP95:%.*]]
-; CHECK-INLINE: 94:
+; CHECK-INLINE: 88:
; CHECK-INLINE-NEXT: call void @__asan_stack_free_8(i64 [[TMP7]], i64 8544)
; CHECK-INLINE-NEXT: br label [[TMP97:%.*]]
-; CHECK-INLINE: 95:
+; CHECK-INLINE: 89:
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP29]], i64 4)
; CHECK-INLINE-NEXT: [[TMP96:%.*]] = add i64 [[TMP29]], 1028
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP96]], i64 40)
; CHECK-INLINE-NEXT: br label [[TMP97]]
-; CHECK-INLINE: 97:
+; CHECK-INLINE: 91:
; CHECK-INLINE-NEXT: ret void
; CHECK-INLINE: ehcleanup:
; CHECK-INLINE-NEXT: [[TMP98:%.*]] = cleanuppad within none []
@@ -189,23 +183,27 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP102:%.*]] = load i8, ptr [[TMP101]], align 1
; CHECK-INLINE-NEXT: [[TMP103:%.*]] = icmp ne i8 [[TMP102]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP103]], label [[TMP104:%.*]], label [[TMP109:%.*]], !prof [[PROF1]]
-; CHECK-INLINE: 104:
+; CHECK-INLINE: 98:
; CHECK-INLINE-NEXT: [[TMP105:%.*]] = and i64 [[TMP54]], 7
; CHECK-INLINE-NEXT: [[TMP106:%.*]] = trunc i64 [[TMP105]] to i8
; CHECK-INLINE-NEXT: [[TMP107:%.*]] = icmp sge i8 [[TMP106]], [[TMP102]]
; CHECK-INLINE-NEXT: br i1 [[TMP107]], label [[TMP108:%.*]], label [[TMP109]]
-; CHECK-INLINE: 108:
+; CHECK-INLINE: 102:
; CHECK-INLINE-NEXT: call void @__asan_report_store1(i64 [[TMP54]]) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
-; CHECK-INLINE: 109:
+; CHECK-INLINE: 103:
; CHECK-INLINE-NEXT: store volatile i8 0, ptr [[TMP56]], align 1
; CHECK-INLINE-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP54]], i64 4) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: call void @DeInit(ptr [[TMP14]]) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP110:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP111:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP112:%.*]] = call ptr @__asan_memmove(ptr [[TMP20]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP98]]) ]
+; CHECK-INLINE-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-INLINE-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP18]] to i64
; CHECK-INLINE-NEXT: call void @__sanitizer_ptr_cmp(i64 [[TMP15]], i64 [[TMP17]]) [ "funclet"(token [[TMP98]]) ]
-; CHECK-INLINE-NEXT: call void @__sanitizer_ptr_sub(i64 [[TMP15]], i64 [[TMP17]]) [ "funclet"(token [[TMP98]]) ]
+; CHECK-INLINE-NEXT: [[ADDR1:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-INLINE-NEXT: [[ADDR2:%.*]] = ptrtoint ptr [[TMP18]] to i64
+; CHECK-INLINE-NEXT: call void @__sanitizer_ptr_sub(i64 [[ADDR1]], i64 [[ADDR2]]) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP113:%.*]] = ptrtoint ptr [[PTRPARAM]] to i64
; CHECK-INLINE-NEXT: [[TMP114:%.*]] = add i64 [[TMP113]], 7
; CHECK-INLINE-NEXT: [[TMP115:%.*]] = inttoptr i64 [[TMP114]] to ptr
@@ -216,27 +214,27 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
; CHECK-INLINE-NEXT: [[TMP121:%.*]] = icmp ne i8 [[TMP120]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP121]], label [[TMP122:%.*]], label [[TMP127:%.*]], !prof [[PROF1]]
-; CHECK-INLINE: 122:
+; CHECK-INLINE: 118:
; CHECK-INLINE-NEXT: [[TMP123:%.*]] = and i64 [[TMP116]], 7
; CHECK-INLINE-NEXT: [[TMP124:%.*]] = trunc i64 [[TMP123]] to i8
; CHECK-INLINE-NEXT: [[TMP125:%.*]] = icmp sge i8 [[TMP124]], [[TMP120]]
; CHECK-INLINE-NEXT: br i1 [[TMP125]], label [[TMP126:%.*]], label [[TMP127]]
-; CHECK-INLINE: 126:
+; CHECK-INLINE: 122:
; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP116]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
-; CHECK-INLINE: 127:
+; CHECK-INLINE: 123:
; CHECK-INLINE-NEXT: [[TMP128:%.*]] = lshr i64 [[TMP114]], 3
; CHECK-INLINE-NEXT: [[TMP129:%.*]] = add i64 [[TMP128]], [[TMP1]]
; CHECK-INLINE-NEXT: [[TMP130:%.*]] = inttoptr i64 [[TMP129]] to ptr
; CHECK-INLINE-NEXT: [[TMP131:%.*]] = load i8, ptr [[TMP130]], align 1
; CHECK-INLINE-NEXT: [[TMP132:%.*]] = icmp ne i8 [[TMP131]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP132]], label [[TMP133:%.*]], label [[EHEXIT:%.*]], !prof [[PROF1]]
-; CHECK-INLINE: 133:
+; CHECK-INLINE: 129:
; CHECK-INLINE-NEXT: [[TMP134:%.*]] = and i64 [[TMP114]], 7
; CHECK-INLINE-NEXT: [[TMP135:%.*]] = trunc i64 [[TMP134]] to i8
; CHECK-INLINE-NEXT: [[TMP136:%.*]] = icmp sge i8 [[TMP135]], [[TMP131]]
; CHECK-INLINE-NEXT: br i1 [[TMP136]], label [[TMP137:%.*]], label [[EHEXIT]]
-; CHECK-INLINE: 137:
+; CHECK-INLINE: 133:
; CHECK-INLINE-NEXT: call void @__asan_report_store_n(i64 [[TMP114]], i64 8) #[[ATTR7]] [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: unreachable
; CHECK-INLINE: ehexit:
@@ -249,19 +247,19 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-INLINE-NEXT: store i64 1172321806, ptr [[TMP23]], align 8
; CHECK-INLINE-NEXT: [[TMP142:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-INLINE-NEXT: br i1 [[TMP142]], label [[TMP143:%.*]], label [[TMP144:%.*]]
-; CHECK-INLINE: 143:
+; CHECK-INLINE: 139:
; CHECK-INLINE-NEXT: call void @__asan_stack_free_8(i64 [[TMP7]], i64 8544) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: br label [[TMP146:%.*]]
-; CHECK-INLINE: 144:
+; CHECK-INLINE: 140:
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP29]], i64 4) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: [[TMP145:%.*]] = add i64 [[TMP29]], 1028
; CHECK-INLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP145]], i64 40) [ "funclet"(token [[TMP98]]) ]
; CHECK-INLINE-NEXT: br label [[TMP146]]
-; CHECK-INLINE: 146:
+; CHECK-INLINE: 142:
; CHECK-INLINE-NEXT: cleanupret from [[TMP98]] unwind to caller
;
; CHECK-OUTLINE-LABEL: define void @FuncletPersonality(
-; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @__CxxFrameHandler3 {
+; CHECK-OUTLINE-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR2:[0-9]+]] personality ptr @__CxxFrameHandler3 {
; CHECK-OUTLINE-NEXT: entry:
; CHECK-OUTLINE-NEXT: [[TMP0:%.*]] = alloca i64, align 32
; CHECK-OUTLINE-NEXT: store i64 0, ptr [[TMP0]], align 8
@@ -275,37 +273,28 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: br label [[TMP6]]
; CHECK-OUTLINE: 6:
; CHECK-OUTLINE-NEXT: [[TMP7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[TMP4]] ]
+; CHECK-OUTLINE-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP7]] to ptr
; CHECK-OUTLINE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
; CHECK-OUTLINE-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; CHECK-OUTLINE: 9:
+; CHECK-OUTLINE: 10:
; CHECK-OUTLINE-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 8608, align 32
-; CHECK-OUTLINE-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-OUTLINE-NEXT: br label [[TMP11]]
; CHECK-OUTLINE: 11:
-; CHECK-OUTLINE-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP7]], [[TMP6]] ], [ [[TMP10]], [[TMP9]] ]
-; CHECK-OUTLINE-NEXT: store i64 [[TMP12]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
-; CHECK-OUTLINE-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 32
-; CHECK-OUTLINE-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP15:%.*]] = add i64 [[TMP12]], 8480
-; CHECK-OUTLINE-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP17:%.*]] = add i64 [[TMP12]], 8496
-; CHECK-OUTLINE-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP19:%.*]] = add i64 [[TMP12]], 8512
-; CHECK-OUTLINE-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP21:%.*]] = add i64 [[TMP12]], 8528
-; CHECK-OUTLINE-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP23:%.*]] = add i64 [[TMP12]], 8544
-; CHECK-OUTLINE-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP25:%.*]] = add i64 [[TMP12]], 8560
-; CHECK-OUTLINE-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-; CHECK-OUTLINE-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-OUTLINE-NEXT: [[TMP27:%.*]] = phi ptr [ [[TMP10]], [[TMP6]] ], [ [[MYALLOCA]], [[TMP9]] ]
+; CHECK-OUTLINE-NEXT: store ptr [[TMP27]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
+; CHECK-OUTLINE-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP27]], i64 32
+; CHECK-OUTLINE-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8480
+; CHECK-OUTLINE-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8496
+; CHECK-OUTLINE-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8512
+; CHECK-OUTLINE-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8528
+; CHECK-OUTLINE-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8544
+; CHECK-OUTLINE-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8560
; CHECK-OUTLINE-NEXT: store i64 1102416563, ptr [[TMP27]], align 8
-; CHECK-OUTLINE-NEXT: [[TMP28:%.*]] = add i64 [[TMP12]], 8
-; CHECK-OUTLINE-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-OUTLINE-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP27]], i64 8
; CHECK-OUTLINE-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP29]], align 8
-; CHECK-OUTLINE-NEXT: [[TMP30:%.*]] = add i64 [[TMP12]], 16
-; CHECK-OUTLINE-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+; CHECK-OUTLINE-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[TMP27]], i64 16
; CHECK-OUTLINE-NEXT: store i64 ptrtoint (ptr @FuncletPersonality to i64), ptr [[TMP31]], align 8
+; CHECK-OUTLINE-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP27]] to i64
; CHECK-OUTLINE-NEXT: [[TMP32:%.*]] = lshr i64 [[TMP12]], 3
; CHECK-OUTLINE-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], [[TMP1]]
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f1(i64 [[TMP33]], i64 4)
@@ -335,10 +324,12 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f3(i64 [[TMP45]], i64 5)
; CHECK-OUTLINE-NEXT: [[TMP46:%.*]] = add i64 [[TMP33]], 1066
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP46]], i64 1)
+; CHECK-OUTLINE-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP22]] to i64
; CHECK-OUTLINE-NEXT: call void @__asan_store1(i64 [[TMP21]])
; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP22]], align 1
; CHECK-OUTLINE-NEXT: [[TMP47:%.*]] = add i64 [[TMP33]], 1066
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP47]], i64 1)
+; CHECK-OUTLINE-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[TMP26]] to i64
; CHECK-OUTLINE-NEXT: call void @__asan_store8(i64 [[TMP25]])
; CHECK-OUTLINE-NEXT: store volatile i64 0, ptr [[TMP26]], align 8
; CHECK-OUTLINE-NEXT: [[TMPCOPYI64:%.*]] = load i64, ptr [[TMP26]], align 8
@@ -367,22 +358,23 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: store i64 1172321806, ptr [[TMP27]], align 8
; CHECK-OUTLINE-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-OUTLINE-NEXT: br i1 [[TMP61]], label [[TMP62:%.*]], label [[TMP63:%.*]]
-; CHECK-OUTLINE: 62:
+; CHECK-OUTLINE: 55:
; CHECK-OUTLINE-NEXT: call void @__asan_stack_free_8(i64 [[TMP7]], i64 8608)
; CHECK-OUTLINE-NEXT: br label [[TMP66:%.*]]
-; CHECK-OUTLINE: 63:
+; CHECK-OUTLINE: 56:
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP33]], i64 4)
; CHECK-OUTLINE-NEXT: [[TMP64:%.*]] = add i64 [[TMP33]], 1028
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP64]], i64 42)
; CHECK-OUTLINE-NEXT: [[TMP65:%.*]] = add i64 [[TMP33]], 1071
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP65]], i64 5)
; CHECK-OUTLINE-NEXT: br label [[TMP66]]
-; CHECK-OUTLINE: 66:
+; CHECK-OUTLINE: 59:
; CHECK-OUTLINE-NEXT: ret void
; CHECK-OUTLINE: ehcleanup:
; CHECK-OUTLINE-NEXT: [[TMP67:%.*]] = cleanuppad within none []
; CHECK-OUTLINE-NEXT: [[TMP68:%.*]] = add i64 [[TMP33]], 1068
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_04(i64 [[TMP68]], i64 1) [ "funclet"(token [[TMP67]]) ]
+; CHECK-OUTLINE-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP24]] to i64
; CHECK-OUTLINE-NEXT: call void @__asan_store1(i64 [[TMP23]]) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: store volatile i8 0, ptr [[TMP24]], align 1
; CHECK-OUTLINE-NEXT: [[TMP69:%.*]] = add i64 [[TMP33]], 1068
@@ -391,8 +383,12 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: [[TMP70:%.*]] = call ptr @__asan_memset(ptr [[TMP16]], i32 0, i64 4) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP71:%.*]] = call ptr @__asan_memcpy(ptr [[TMP18]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP72:%.*]] = call ptr @__asan_memmove(ptr [[TMP20]], ptr [[TMP16]], i64 4) [ "funclet"(token [[TMP67]]) ]
+; CHECK-OUTLINE-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-OUTLINE-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP18]] to i64
; CHECK-OUTLINE-NEXT: call void @__sanitizer_ptr_cmp(i64 [[TMP15]], i64 [[TMP17]]) [ "funclet"(token [[TMP67]]) ]
-; CHECK-OUTLINE-NEXT: call void @__sanitizer_ptr_sub(i64 [[TMP15]], i64 [[TMP17]]) [ "funclet"(token [[TMP67]]) ]
+; CHECK-OUTLINE-NEXT: [[ADDR1:%.*]] = ptrtoint ptr [[TMP16]] to i64
+; CHECK-OUTLINE-NEXT: [[ADDR2:%.*]] = ptrtoint ptr [[TMP18]] to i64
+; CHECK-OUTLINE-NEXT: call void @__sanitizer_ptr_sub(i64 [[ADDR1]], i64 [[ADDR2]]) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP73:%.*]] = ptrtoint ptr [[PTRPARAM]] to i64
; CHECK-OUTLINE-NEXT: call void @__asan_storeN(i64 [[TMP73]], i64 8) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: store i64 0, ptr [[PTRPARAM]], align 1
@@ -404,17 +400,17 @@ define void @FuncletPersonality(ptr %ptrParam) sanitize_address personality ptr
; CHECK-OUTLINE-NEXT: store i64 1172321806, ptr [[TMP27]], align 8
; CHECK-OUTLINE-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-OUTLINE-NEXT: br i1 [[TMP78]], label [[TMP79:%.*]], label [[TMP80:%.*]]
-; CHECK-OUTLINE: 79:
+; CHECK-OUTLINE: 75:
; CHECK-OUTLINE-NEXT: call void @__asan_stack_free_8(i64 [[TMP7]], i64 8608) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: br label [[TMP83:%.*]]
-; CHECK-OUTLINE: 80:
+; CHECK-OUTLINE: 76:
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP33]], i64 4) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP81:%.*]] = add i64 [[TMP33]], 1028
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP81]], i64 42) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: [[TMP82:%.*]] = add i64 [[TMP33]], 1071
; CHECK-OUTLINE-NEXT: call void @__asan_set_shadow_00(i64 [[TMP82]], i64 5) [ "funclet"(token [[TMP67]]) ]
; CHECK-OUTLINE-NEXT: br label [[TMP83]]
-; CHECK-OUTLINE: 83:
+; CHECK-OUTLINE: 79:
; CHECK-OUTLINE-NEXT: cleanupret from [[TMP67]] unwind to caller
;
@@ -487,7 +483,7 @@ nopredecessor:
; Non-Windows personality, ensure no funclet gets attached to asan runtime call.
define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @dummyPersonality {
; CHECK-LABEL: define void @OtherPersonality(
-; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR3:[0-9]+]] personality ptr @dummyPersonality {
+; CHECK-SAME: ptr [[PTRPARAM:%.*]]) #[[ATTR2:[0-9]+]] personality ptr @dummyPersonality {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8
; CHECK-NEXT: [[ASAN_LOCAL_STACK_BASE:%.*]] = alloca i64, align 8
@@ -499,25 +495,22 @@ define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @d
; CHECK-NEXT: br label [[TMP5]]
; CHECK: 5:
; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[TMP3]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]]
-; CHECK: 8:
+; CHECK: 9:
; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-NEXT: br label [[TMP10]]
; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP6]], [[TMP5]] ], [ [[TMP9]], [[TMP8]] ]
-; CHECK-NEXT: store i64 [[TMP11]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 32
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP14:%.*]] = phi ptr [ [[TMP9]], [[TMP5]] ], [ [[MYALLOCA]], [[TMP8]] ]
+; CHECK-NEXT: store ptr [[TMP14]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP14]], i64 32
; CHECK-NEXT: store i64 1102416563, ptr [[TMP14]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP14]], i64 8
; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP16]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP11]], 16
-; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 16
; CHECK-NEXT: store i64 ptrtoint (ptr @OtherPersonality to i64), ptr [[TMP18]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP14]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP11]], 3
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], [[TMP0]]
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0
@@ -532,14 +525,14 @@ define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @d
; CHECK-NEXT: store i64 1172321806, ptr [[TMP14]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP24]], label [[TMP25:%.*]], label [[TMP26:%.*]]
-; CHECK: 25:
+; CHECK: 22:
; CHECK-NEXT: call void @__asan_stack_free_0(i64 [[TMP6]], i64 64)
; CHECK-NEXT: br label [[TMP28:%.*]]
-; CHECK: 26:
+; CHECK: 23:
; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP20]], 0
; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP27]], i64 8)
; CHECK-NEXT: br label [[TMP28]]
-; CHECK: 28:
+; CHECK: 25:
; CHECK-NEXT: ret void
; CHECK: ehcleanup:
; CHECK-NEXT: [[TMP29:%.*]] = cleanuppad within none []
@@ -547,14 +540,14 @@ define void @OtherPersonality(ptr %ptrParam) sanitize_address personality ptr @d
; CHECK-NEXT: store i64 1172321806, ptr [[TMP14]], align 8
; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i64 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TMP31]], label [[TMP32:%.*]], label [[TMP33:%.*]]
-; CHECK: 32:
+; CHECK: 29:
; CHECK-NEXT: call void @__asan_stack_free_0(i64 [[TMP6]], i64 64)
; CHECK-NEXT: br label [[TMP35:%.*]]
-; CHECK: 33:
+; CHECK: 30:
; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP20]], 0
; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP34]], i64 8)
; CHECK-NEXT: br label [[TMP35]]
-; CHECK: 35:
+; CHECK: 32:
; CHECK-NEXT: cleanupret from [[TMP29]] unwind to caller
;
entry:
diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll
new file mode 100644
index 0000000..e38da0b
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-win-dont-instrument-catchpad.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -passes=asan -S | FileCheck %s
+; CHECK: %ex = alloca i32, align 4
+; CHECK: catchpad within %{{.*}} [ptr @"??_R0H@8", i32 0, ptr %ex]
+
+; This test ensures that catch parameters are not instrumented on Windows.
+
+; This file was generated using the following source
+;
+; ```C++
+; #include <exception>
+; #include <cstdio>
+;
+; int main() {
+; try {
+; throw 1;
+; } catch (const int ex) {
+; printf("%d\n", ex);
+; return -1;
+; }
+; return 0;
+; }
+;
+; ```
+; then running the following sequence of commands
+;
+; ```
+; clang.exe -g0 -O0 -emit-llvm -c main.cpp -o main.bc
+; llvm-extract.exe -func=main main.bc -o main_func.bc
+; llvm-dis.exe main_func.bc -o main_func_dis.ll
+; ```
+; and finally manually trimming the resulting `.ll` file to remove
+; unnecessary metadata, and manually adding the `sanitize_address` annotation;
+; needed for the ASan pass to run.
+
+target triple = "x86_64-pc-windows-msvc"
+
+@"??_R0H@8" = external global ptr
+
+; Function Attrs: sanitize_address
+define i32 @main() sanitize_address personality ptr @__CxxFrameHandler3 {
+entry:
+ %ex = alloca i32, align 4
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [ptr @"??_R0H@8", i32 0, ptr %ex]
+ call void @opaque() [ "funclet"(token %1) ]
+ catchret from %1 to label %return
+
+return: ; preds = %catch
+ ret i32 0
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+declare void @throw() noreturn
+declare void @opaque()
+declare i32 @__CxxFrameHandler3(...)
diff --git a/llvm/test/Instrumentation/AddressSanitizer/fake-stack.ll b/llvm/test/Instrumentation/AddressSanitizer/fake-stack.ll
index 3cccabb1..1b00cd8 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/fake-stack.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/fake-stack.ll
@@ -20,24 +20,20 @@ define void @Simple() uwtable sanitize_address {
; NEVER-LABEL: @Simple(
; NEVER-NEXT: entry:
; NEVER-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; NEVER-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; NEVER-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
-; NEVER-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-; NEVER-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; NEVER-NEXT: store i64 1102416563, ptr [[TMP3]], align 8
-; NEVER-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8
-; NEVER-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; NEVER-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 32
+; NEVER-NEXT: store i64 1102416563, ptr [[MYALLOCA]], align 8
+; NEVER-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 8
; NEVER-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP5]], align 8
-; NEVER-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16
-; NEVER-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; NEVER-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 16
; NEVER-NEXT: store i64 ptrtoint (ptr @Simple to i64), ptr [[TMP7]], align 8
+; NEVER-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; NEVER-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3
; NEVER-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
; NEVER-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
; NEVER-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
; NEVER-NEXT: store i64 -868083113472691727, ptr [[TMP11]], align 1
; NEVER-NEXT: call void @Foo(ptr [[TMP2]])
-; NEVER-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
+; NEVER-NEXT: store i64 1172321806, ptr [[MYALLOCA]], align 8
; NEVER-NEXT: [[TMP12:%.*]] = add i64 [[TMP9]], 0
; NEVER-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
; NEVER-NEXT: store i64 0, ptr [[TMP13]], align 1
@@ -54,25 +50,22 @@ define void @Simple() uwtable sanitize_address {
; RUNTIME-NEXT: br label [[TMP4]]
; RUNTIME: 4:
; RUNTIME-NEXT: [[TMP5:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[TMP2]] ]
+; RUNTIME-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP5]] to ptr
; RUNTIME-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 0
; RUNTIME-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
-; RUNTIME: 7:
+; RUNTIME: 8:
; RUNTIME-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; RUNTIME-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; RUNTIME-NEXT: br label [[TMP9]]
; RUNTIME: 9:
-; RUNTIME-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP5]], [[TMP4]] ], [ [[TMP8]], [[TMP7]] ]
-; RUNTIME-NEXT: store i64 [[TMP10]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
-; RUNTIME-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 32
-; RUNTIME-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
-; RUNTIME-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; RUNTIME-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP8]], [[TMP4]] ], [ [[MYALLOCA]], [[TMP7]] ]
+; RUNTIME-NEXT: store ptr [[TMP13]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
+; RUNTIME-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP13]], i64 32
; RUNTIME-NEXT: store i64 1102416563, ptr [[TMP13]], align 8
-; RUNTIME-NEXT: [[TMP14:%.*]] = add i64 [[TMP10]], 8
-; RUNTIME-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+; RUNTIME-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP13]], i64 8
; RUNTIME-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP15]], align 8
-; RUNTIME-NEXT: [[TMP16:%.*]] = add i64 [[TMP10]], 16
-; RUNTIME-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
+; RUNTIME-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 16
; RUNTIME-NEXT: store i64 ptrtoint (ptr @Simple to i64), ptr [[TMP17]], align 8
+; RUNTIME-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP13]] to i64
; RUNTIME-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP10]], 3
; RUNTIME-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 2147450880
; RUNTIME-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 0
@@ -82,47 +75,43 @@ define void @Simple() uwtable sanitize_address {
; RUNTIME-NEXT: store i64 1172321806, ptr [[TMP13]], align 8
; RUNTIME-NEXT: [[TMP22:%.*]] = icmp ne i64 [[TMP5]], 0
; RUNTIME-NEXT: br i1 [[TMP22]], label [[TMP23:%.*]], label [[TMP30:%.*]]
-; RUNTIME: 23:
+; RUNTIME: 20:
; RUNTIME-NEXT: [[TMP24:%.*]] = add i64 [[TMP19]], 0
; RUNTIME-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
; RUNTIME-NEXT: store i64 -723401728380766731, ptr [[TMP25]], align 1
-; RUNTIME-NEXT: [[TMP26:%.*]] = add i64 [[TMP5]], 56
-; RUNTIME-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr
+; RUNTIME-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP8]], i64 56
; RUNTIME-NEXT: [[TMP28:%.*]] = load i64, ptr [[TMP27]], align 8
; RUNTIME-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
; RUNTIME-NEXT: store i8 0, ptr [[TMP29]], align 1
; RUNTIME-NEXT: br label [[TMP33:%.*]]
-; RUNTIME: 30:
+; RUNTIME: 26:
; RUNTIME-NEXT: [[TMP31:%.*]] = add i64 [[TMP19]], 0
; RUNTIME-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
; RUNTIME-NEXT: store i64 0, ptr [[TMP32]], align 1
; RUNTIME-NEXT: br label [[TMP33]]
-; RUNTIME: 33:
+; RUNTIME: 29:
; RUNTIME-NEXT: ret void
;
; ALWAYS-LABEL: @Simple(
; ALWAYS-NEXT: entry:
; ALWAYS-NEXT: [[ASAN_LOCAL_STACK_BASE:%.*]] = alloca i64, align 8
; ALWAYS-NEXT: [[TMP0:%.*]] = call i64 @__asan_stack_malloc_always_0(i64 64)
+; ALWAYS-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
; ALWAYS-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 0
; ALWAYS-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP4:%.*]]
-; ALWAYS: 2:
+; ALWAYS: 3:
; ALWAYS-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; ALWAYS-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; ALWAYS-NEXT: br label [[TMP4]]
; ALWAYS: 4:
-; ALWAYS-NEXT: [[TMP5:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP3]], [[TMP2]] ]
-; ALWAYS-NEXT: store i64 [[TMP5]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
-; ALWAYS-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 32
-; ALWAYS-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; ALWAYS-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; ALWAYS-NEXT: [[TMP8:%.*]] = phi ptr [ [[TMP3]], [[ENTRY:%.*]] ], [ [[MYALLOCA]], [[TMP2]] ]
+; ALWAYS-NEXT: store ptr [[TMP8]], ptr [[ASAN_LOCAL_STACK_BASE]], align 8
+; ALWAYS-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP8]], i64 32
; ALWAYS-NEXT: store i64 1102416563, ptr [[TMP8]], align 8
-; ALWAYS-NEXT: [[TMP9:%.*]] = add i64 [[TMP5]], 8
-; ALWAYS-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; ALWAYS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i64 8
; ALWAYS-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP10]], align 8
-; ALWAYS-NEXT: [[TMP11:%.*]] = add i64 [[TMP5]], 16
-; ALWAYS-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; ALWAYS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16
; ALWAYS-NEXT: store i64 ptrtoint (ptr @Simple to i64), ptr [[TMP12]], align 8
+; ALWAYS-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP8]] to i64
; ALWAYS-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP5]], 3
; ALWAYS-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 2147450880
; ALWAYS-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], 0
@@ -132,22 +121,21 @@ define void @Simple() uwtable sanitize_address {
; ALWAYS-NEXT: store i64 1172321806, ptr [[TMP8]], align 8
; ALWAYS-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP0]], 0
; ALWAYS-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP25:%.*]]
-; ALWAYS: 18:
+; ALWAYS: 15:
; ALWAYS-NEXT: [[TMP19:%.*]] = add i64 [[TMP14]], 0
; ALWAYS-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
; ALWAYS-NEXT: store i64 -723401728380766731, ptr [[TMP20]], align 1
-; ALWAYS-NEXT: [[TMP21:%.*]] = add i64 [[TMP0]], 56
-; ALWAYS-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; ALWAYS-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 56
; ALWAYS-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP22]], align 8
; ALWAYS-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
; ALWAYS-NEXT: store i8 0, ptr [[TMP24]], align 1
; ALWAYS-NEXT: br label [[TMP28:%.*]]
-; ALWAYS: 25:
+; ALWAYS: 21:
; ALWAYS-NEXT: [[TMP26:%.*]] = add i64 [[TMP14]], 0
; ALWAYS-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr
; ALWAYS-NEXT: store i64 0, ptr [[TMP27]], align 1
; ALWAYS-NEXT: br label [[TMP28]]
-; ALWAYS: 28:
+; ALWAYS: 24:
; ALWAYS-NEXT: ret void
;
entry:
@@ -160,17 +148,13 @@ define void @Huge() uwtable sanitize_address {
; CHECK-LABEL: @Huge(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 100288, align 32
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
-; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 32
+; CHECK-NEXT: store i64 1102416563, ptr [[MYALLOCA]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 8
; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 16
; CHECK-NEXT: store i64 ptrtoint (ptr @Huge to i64), ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
@@ -189,7 +173,7 @@ define void @Huge() uwtable sanitize_address {
; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
; CHECK-NEXT: store i64 -868082074056920077, ptr [[TMP19]], align 1
; CHECK-NEXT: call void @Foo(ptr [[TMP2]])
-; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
+; CHECK-NEXT: store i64 1172321806, ptr [[MYALLOCA]], align 8
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP9]], 0
; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr
; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 1
diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
index d1e0180..82e114e 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -16,17 +16,13 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = alloca i64, align 32
; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP1]], align 8
; CHECK-DEFAULT-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; CHECK-DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 32
-; CHECK-DEFAULT-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-DEFAULT-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP2]] to ptr
-; CHECK-DEFAULT-NEXT: store i64 1102416563, ptr [[TMP5]], align 8
-; CHECK-DEFAULT-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], 8
-; CHECK-DEFAULT-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 32
+; CHECK-DEFAULT-NEXT: store i64 1102416563, ptr [[MYALLOCA]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 8
; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP7]], align 8
-; CHECK-DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 16
-; CHECK-DEFAULT-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 16
; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP9]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-DEFAULT-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 3
; CHECK-DEFAULT-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880
; CHECK-DEFAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
@@ -41,16 +37,16 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
; CHECK-DEFAULT-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
; CHECK-DEFAULT-NEXT: [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0
-; CHECK-DEFAULT-NEXT: br i1 [[TMP21]], label %[[BB22:.*]], label %[[BB27:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK-DEFAULT: [[BB22]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP21]], label %[[BB18:.*]], label %[[BB23:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-DEFAULT: [[BB18]]:
; CHECK-DEFAULT-NEXT: [[TMP23:%.*]] = and i64 [[TMP16]], 7
; CHECK-DEFAULT-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i8
; CHECK-DEFAULT-NEXT: [[TMP25:%.*]] = icmp sge i8 [[TMP24]], [[TMP20]]
-; CHECK-DEFAULT-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[BB27]]
-; CHECK-DEFAULT: [[BB26]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP25]], label %[[BB22:.*]], label %[[BB23]]
+; CHECK-DEFAULT: [[BB22]]:
; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP16]]) #[[ATTR4:[0-9]+]]
; CHECK-DEFAULT-NEXT: unreachable
-; CHECK-DEFAULT: [[BB27]]:
+; CHECK-DEFAULT: [[BB23]]:
; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1
; CHECK-DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
@@ -73,16 +69,16 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr
; CHECK-DEFAULT-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP41]], align 1
; CHECK-DEFAULT-NEXT: [[TMP43:%.*]] = icmp ne i8 [[TMP42]], 0
-; CHECK-DEFAULT-NEXT: br i1 [[TMP43]], label %[[BB44:.*]], label %[[BB49:.*]], !prof [[PROF1]]
-; CHECK-DEFAULT: [[BB44]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP43]], label %[[BB40:.*]], label %[[BB45:.*]], !prof [[PROF1]]
+; CHECK-DEFAULT: [[BB40]]:
; CHECK-DEFAULT-NEXT: [[TMP45:%.*]] = and i64 [[TMP38]], 7
; CHECK-DEFAULT-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i8
; CHECK-DEFAULT-NEXT: [[TMP47:%.*]] = icmp sge i8 [[TMP46]], [[TMP42]]
-; CHECK-DEFAULT-NEXT: br i1 [[TMP47]], label %[[BB48:.*]], label %[[BB49]]
-; CHECK-DEFAULT: [[BB48]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP47]], label %[[BB44:.*]], label %[[BB45]]
+; CHECK-DEFAULT: [[BB44]]:
; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP38]]) #[[ATTR4]]
; CHECK-DEFAULT-NEXT: unreachable
-; CHECK-DEFAULT: [[BB49]]:
+; CHECK-DEFAULT: [[BB45]]:
; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP36]], align 1
; CHECK-DEFAULT-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[TMP36]] to i64
; CHECK-DEFAULT-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP50]], i64 40)
@@ -95,16 +91,16 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
; CHECK-DEFAULT-NEXT: [[TMP57:%.*]] = load i8, ptr [[TMP56]], align 1
; CHECK-DEFAULT-NEXT: [[TMP58:%.*]] = icmp ne i8 [[TMP57]], 0
-; CHECK-DEFAULT-NEXT: br i1 [[TMP58]], label %[[BB59:.*]], label %[[BB64:.*]], !prof [[PROF1]]
-; CHECK-DEFAULT: [[BB59]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP58]], label %[[BB55:.*]], label %[[BB60:.*]], !prof [[PROF1]]
+; CHECK-DEFAULT: [[BB55]]:
; CHECK-DEFAULT-NEXT: [[TMP60:%.*]] = and i64 [[TMP53]], 7
; CHECK-DEFAULT-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i8
; CHECK-DEFAULT-NEXT: [[TMP62:%.*]] = icmp sge i8 [[TMP61]], [[TMP57]]
-; CHECK-DEFAULT-NEXT: br i1 [[TMP62]], label %[[BB63:.*]], label %[[BB64]]
-; CHECK-DEFAULT: [[BB63]]:
+; CHECK-DEFAULT-NEXT: br i1 [[TMP62]], label %[[BB59:.*]], label %[[BB60]]
+; CHECK-DEFAULT: [[BB59]]:
; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP53]]) #[[ATTR4]]
; CHECK-DEFAULT-NEXT: unreachable
-; CHECK-DEFAULT: [[BB64]]:
+; CHECK-DEFAULT: [[BB60]]:
; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1
; CHECK-DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[TMP11]], 4
; CHECK-DEFAULT-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
@@ -112,7 +108,7 @@ define void @lifetime() sanitize_address {
; CHECK-DEFAULT-NEXT: [[TMP67:%.*]] = ptrtoint ptr [[TMP1]] to i64
; CHECK-DEFAULT-NEXT: [[TMP68:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-DEFAULT-NEXT: call void @__asan_allocas_unpoison(i64 [[TMP68]], i64 [[TMP67]])
-; CHECK-DEFAULT-NEXT: store i64 1172321806, ptr [[TMP5]], align 8
+; CHECK-DEFAULT-NEXT: store i64 1172321806, ptr [[MYALLOCA]], align 8
; CHECK-DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[TMP11]], 0
; CHECK-DEFAULT-NEXT: [[TMP70:%.*]] = inttoptr i64 [[TMP69]] to ptr
; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP70]], align 1
@@ -121,17 +117,13 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-LABEL: define void @lifetime(
; CHECK-NO-DYNAMIC-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NO-DYNAMIC-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
-; CHECK-NO-DYNAMIC-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
-; CHECK-NO-DYNAMIC-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 32
-; CHECK-NO-DYNAMIC-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-; CHECK-NO-DYNAMIC-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP1]] to ptr
-; CHECK-NO-DYNAMIC-NEXT: store i64 1102416563, ptr [[TMP4]], align 8
-; CHECK-NO-DYNAMIC-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 8
-; CHECK-NO-DYNAMIC-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 32
+; CHECK-NO-DYNAMIC-NEXT: store i64 1102416563, ptr [[MYALLOCA]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 8
; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP6]], align 8
-; CHECK-NO-DYNAMIC-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 16
-; CHECK-NO-DYNAMIC-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[MYALLOCA]], i64 16
; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP8]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
; CHECK-NO-DYNAMIC-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP1]], 3
; CHECK-NO-DYNAMIC-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 2147450880
; CHECK-NO-DYNAMIC-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0
@@ -146,16 +138,16 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
; CHECK-NO-DYNAMIC-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
; CHECK-NO-DYNAMIC-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP20]], label %[[BB21:.*]], label %[[BB26:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK-NO-DYNAMIC: [[BB21]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP20]], label %[[BB17:.*]], label %[[BB22:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK-NO-DYNAMIC: [[BB17]]:
; CHECK-NO-DYNAMIC-NEXT: [[TMP22:%.*]] = and i64 [[TMP15]], 7
; CHECK-NO-DYNAMIC-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i8
; CHECK-NO-DYNAMIC-NEXT: [[TMP24:%.*]] = icmp sge i8 [[TMP23]], [[TMP19]]
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP24]], label %[[BB25:.*]], label %[[BB26]]
-; CHECK-NO-DYNAMIC: [[BB25]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP24]], label %[[BB21:.*]], label %[[BB22]]
+; CHECK-NO-DYNAMIC: [[BB21]]:
; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP15]]) #[[ATTR4:[0-9]+]]
; CHECK-NO-DYNAMIC-NEXT: unreachable
-; CHECK-NO-DYNAMIC: [[BB26]]:
+; CHECK-NO-DYNAMIC: [[BB22]]:
; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1
; CHECK-NO-DYNAMIC-NEXT: [[TMP27:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
@@ -171,16 +163,16 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
; CHECK-NO-DYNAMIC-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
; CHECK-NO-DYNAMIC-NEXT: [[TMP36:%.*]] = icmp ne i8 [[TMP35]], 0
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP36]], label %[[BB37:.*]], label %[[BB42:.*]], !prof [[PROF1]]
-; CHECK-NO-DYNAMIC: [[BB37]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP36]], label %[[BB33:.*]], label %[[BB38:.*]], !prof [[PROF1]]
+; CHECK-NO-DYNAMIC: [[BB33]]:
; CHECK-NO-DYNAMIC-NEXT: [[TMP38:%.*]] = and i64 [[TMP31]], 7
; CHECK-NO-DYNAMIC-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
; CHECK-NO-DYNAMIC-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP35]]
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42]]
-; CHECK-NO-DYNAMIC: [[BB41]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP40]], label %[[BB37:.*]], label %[[BB38]]
+; CHECK-NO-DYNAMIC: [[BB37]]:
; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP31]]) #[[ATTR4]]
; CHECK-NO-DYNAMIC-NEXT: unreachable
-; CHECK-NO-DYNAMIC: [[BB42]]:
+; CHECK-NO-DYNAMIC: [[BB38]]:
; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[ARR]], align 1
; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(ptr [[ARR]])
; CHECK-NO-DYNAMIC-NEXT: [[TMP43:%.*]] = add i64 [[TMP10]], 4
@@ -192,21 +184,21 @@ define void @lifetime() sanitize_address {
; CHECK-NO-DYNAMIC-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr
; CHECK-NO-DYNAMIC-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
; CHECK-NO-DYNAMIC-NEXT: [[TMP50:%.*]] = icmp ne i8 [[TMP49]], 0
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP50]], label %[[BB51:.*]], label %[[BB56:.*]], !prof [[PROF1]]
-; CHECK-NO-DYNAMIC: [[BB51]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP50]], label %[[BB47:.*]], label %[[BB52:.*]], !prof [[PROF1]]
+; CHECK-NO-DYNAMIC: [[BB47]]:
; CHECK-NO-DYNAMIC-NEXT: [[TMP52:%.*]] = and i64 [[TMP45]], 7
; CHECK-NO-DYNAMIC-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i8
; CHECK-NO-DYNAMIC-NEXT: [[TMP54:%.*]] = icmp sge i8 [[TMP53]], [[TMP49]]
-; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP54]], label %[[BB55:.*]], label %[[BB56]]
-; CHECK-NO-DYNAMIC: [[BB55]]:
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP54]], label %[[BB51:.*]], label %[[BB52]]
+; CHECK-NO-DYNAMIC: [[BB51]]:
; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP45]]) #[[ATTR4]]
; CHECK-NO-DYNAMIC-NEXT: unreachable
-; CHECK-NO-DYNAMIC: [[BB56]]:
+; CHECK-NO-DYNAMIC: [[BB52]]:
; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1
; CHECK-NO-DYNAMIC-NEXT: [[TMP57:%.*]] = add i64 [[TMP10]], 4
; CHECK-NO-DYNAMIC-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP58]], align 1
-; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[TMP4]], align 8
+; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[MYALLOCA]], align 8
; CHECK-NO-DYNAMIC-NEXT: [[TMP59:%.*]] = add i64 [[TMP10]], 0
; CHECK-NO-DYNAMIC-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
; CHECK-NO-DYNAMIC-NEXT: store i64 0, ptr [[TMP60]], align 1
diff --git a/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll b/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll
index afa46e44..8b0940d 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll
@@ -16,9 +16,8 @@ entry:
call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !12, metadata !DIExpression()), !dbg !13
; CHECK: %asan_local_stack_base = alloca i64
- ; CHECK: %[[ALLOCA:.*]] = ptrtoint ptr %MyAlloca to i64
- ; CHECK: %[[PHI:.*]] = phi i64 {{.*}} %[[ALLOCA]],
- ; CHECK: store i64 %[[PHI]], ptr %asan_local_stack_base
+ ; CHECK: %[[PHI:.*]] = phi ptr {{.*}} %MyAlloca
+ ; CHECK: store ptr %[[PHI]], ptr %asan_local_stack_base
; CHECK: #dbg_declare(ptr %asan_local_stack_base, [[VAR_I:![0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32), [[LOC_I:![0-9]+]]
%0 = load i32, ptr %i.addr, align 4, !dbg !14
%add = add nsw i32 %0, 2, !dbg !15
diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll
index d85f217..7e14987 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll
@@ -19,8 +19,7 @@ entry:
; CHECK-LABEL: foo
; CHECK: call i64 @__asan_stack_malloc
; CHECK: alloca i8, i64 {{.*}} align 64
-; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to ptr
-; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyPtr]]{{[^%]+}} align 64 %a,{{[^,]+}},
+; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyPtr:%[0-9]+]]{{[^%]+}} align 64 %a,{{[^,]+}},
; CHECK: call i32 @bar(ptr [[copyPtr]])
; CHECK: ret void
@@ -38,8 +37,7 @@ entry:
; CHECK-LABEL: baz
; CHECK: call i64 @__asan_stack_malloc
; CHECK: alloca i8, i64 {{.*}} align 32
-; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to ptr
-; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyPtr]]{{[^%]+}} align 4 %0,{{[^,]+}}
+; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyPtr:%[0-9]+]]{{[^%]+}} align 4 %0,{{[^,]+}}
; CHECK: call i32 @bar(ptr [[copyPtr]])
; CHECK: ret void
diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll
index d56cd34..c8478c8 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll
@@ -19,15 +19,16 @@ entry:
; CHECK-RUNTIME: [[FAKE_STACK_BB:^[0-9]+]]:
; CHECK-RUNTIME: [[FAKE_STACK:%[0-9]+]] = phi i64 [ 0, %entry ], [ [[FAKE_STACK_RT]], %[[UAR_ENABLED_BB]] ]
+; CHECK-RUNTIME: [[FAKE_STACK_PTR:%[0-9]+]] = inttoptr i64 [[FAKE_STACK]] to ptr
+; CHECK-ALWAYS: [[FAKE_STACK_PTR:%[0-9]+]] = inttoptr i64 [[FAKE_STACK_RT]] to ptr
; CHECK-RUNTIME: icmp eq i64 [[FAKE_STACK]], 0
; CHECK-ALWAYS: icmp eq i64 [[FAKE_STACK_RT]], 0
; CHECK: [[NO_FAKE_STACK_BB:^[0-9]+]]:
; CHECK: %MyAlloca = alloca i8, i64
-; CHECK: [[ALLOCA:%[0-9]+]] = ptrtoint ptr %MyAlloca
-; CHECK-RUNTIME: phi i64 [ [[FAKE_STACK]], %[[FAKE_STACK_BB]] ], [ [[ALLOCA]], %[[NO_FAKE_STACK_BB]] ]
-; CHECK-ALWAYS: phi i64 [ [[FAKE_STACK_RT]], %entry ], [ [[ALLOCA]], %[[NO_FAKE_STACK_BB]] ]
+; CHECK-RUNTIME: phi ptr [ [[FAKE_STACK_PTR]], %[[FAKE_STACK_BB]] ], [ %MyAlloca, %[[NO_FAKE_STACK_BB]] ]
+; CHECK-ALWAYS: phi ptr [ [[FAKE_STACK_PTR]], %entry ], [ %MyAlloca, %[[NO_FAKE_STACK_BB]] ]
; CHECK: ret void
diff --git a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
index 3ed68e8..c3a75f6 100644
--- a/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/LTO/X86/memprof-supports-hot-cold-new.ll
@@ -13,14 +13,14 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
-; DUMP: Callsite Context Graph:
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s --check-prefix=IR
; IR: !memprof {{.*}} !callsite
; IR: "memprof"="cold"
+; DUMP: Callsite Context Graph:
+
;; Next check without -supports-hot-cold-new, we should not perform
;; context disambiguation, and we should strip memprof metadata and
;; attributes before optimization.
@@ -28,13 +28,16 @@
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
-; RUN: llvm-dis %t.out.0.0.preopt.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+;; Ensure the attributes and metadata are stripped when running a non-LTO pipeline.
+; RUN: opt -O3 %t.o -S | FileCheck %s \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
source_filename = "memprof-supports-hot-cold-new.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/MC/AArch64/data-directive-specifier.s b/llvm/test/MC/AArch64/data-directive-specifier.s
index 2cb7eb3..2d1ec4f 100644
--- a/llvm/test/MC/AArch64/data-directive-specifier.s
+++ b/llvm/test/MC/AArch64/data-directive-specifier.s
@@ -12,6 +12,7 @@ l:
# CHECK-NEXT: 0x8 R_AARCH64_PLT32 extern 0x4
# CHECK-NEXT: 0xC R_AARCH64_PLT32 g 0x8
# CHECK-NEXT: 0x10 R_AARCH64_PLT32 g 0x18
+# CHECK-NEXT: 0x14 R_AARCH64_FUNCINIT64 .text 0x0
# CHECK-NEXT: }
.data
.word l@plt - .
@@ -21,6 +22,8 @@ l:
.word g@plt - . + 8
.word g@plt - .data + 8
+.quad l@funcinit
+
# CHECK: Section ({{.*}}) .rela.data1 {
# CHECK-NEXT: 0x0 R_AARCH64_GOTPCREL32 data1 0x0
# CHECK-NEXT: 0x4 R_AARCH64_GOTPCREL32 extern 0x4
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
index 73653d0..6345b2f 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
s_mov_b64 s[2:3], 0x10abcdef12345678
// GFX1250: s_mov_b64 s[2:3], 0x10abcdef12345678 ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
@@ -62,10 +62,8 @@ s_cselect_b64 s[2:3], s[4:5], 0x10abcdef12345678
s_mov_b64 s[2:3], 0xffffffff01234567
// GFX1250: s_mov_b64 s[2:3], 0xffffffff01234567 ; encoding: [0xfe,0x01,0x82,0xbe,0x67,0x45,0x23,0x01,0xff,0xff,0xff,0xff]
-// TODO: disasm
s_mov_b64 s[2:3], lit64(0x777)
-// GFX1250-ASM: s_mov_b64 s[2:3], lit64(0x777) ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00]
-// GFX1250-DIS: s_mov_b64 s[2:3], 0x777 ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00]
+// GFX1250: s_mov_b64 s[2:3], lit64(0x777) ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00]
s_mov_b64 s[2:3], 0x777
// GFX1250: s_mov_b64 s[2:3], 0x777 ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
index 5b6bb47..83313a2 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vds_alias.s
@@ -5,3 +5,15 @@ ds_load_tr_b64 v[2:3], v0
ds_load_tr_b128 v[2:5], v0
// GFX1250: ds_load_tr16_b128 v[2:5], v0 ; encoding: [0x00,0x00,0xf0,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b128_tr_b16 v[2:5], v0
+// GFX1250: ds_load_tr16_b128 v[2:5], v0 ; encoding: [0x00,0x00,0xf0,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b64_tr_b8 v[2:3], v0
+// GFX1250: ds_load_tr8_b64 v[2:3], v0 ; encoding: [0x00,0x00,0xf4,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_b64_tr_b4 v[2:3], v0
+// GFX1250: ds_load_tr4_b64 v[2:3], v0 ; encoding: [0x00,0x00,0xe8,0xdb,0x00,0x00,0x00,0x02]
+
+ds_load_tr6_b96 v[2:4], v0
+// GFX1250: ds_load_tr6_b96 v[2:4], v0 ; encoding: [0x00,0x00,0xec,0xdb,0x00,0x00,0x00,0x02]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
index 6b2dd67..f983bc0 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_alias.s
@@ -35,3 +35,78 @@ global_load_tr_b128 v[2:5], v[6:7], off offset:64
global_load_tr_b128 v[2:5], v[6:7], off offset:-64
// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:-64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1]
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1] offset:64
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr8_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off offset:64
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+
+global_load_b64_tr_b8 v[2:3], v[4:5], off offset:-64
+// GFX1250: global_load_tr8_b64 v[2:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x00,0x16,0xee,0x02,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1]
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1] offset:64
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr16_b128 v[2:5], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off offset:64
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0x40,0x00,0x00]
+
+global_load_b128_tr_b16 v[2:5], v[6:7], off offset:-64
+// GFX1250: global_load_tr16_b128 v[2:5], v[6:7], off offset:-64 ; encoding: [0x7c,0xc0,0x15,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1]
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1] offset:64
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr4_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off offset:64
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+
+global_load_b64_tr_b4 v[2:3], v[4:5], off offset:-64
+// GFX1250: global_load_tr4_b64 v[2:3], v[4:5], off offset:-64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x02,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1]
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[3:5], v0, s[0:1]
+// GFX1250: global_load_tr6_b96 v[3:5], v0, s[0:1] ; encoding: [0x00,0x00,0x1d,0xee,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1] offset:64
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v0, s[0:1] offset:-64
+// GFX1250: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off offset:64
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off offset:64 ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0x40,0x00,0x00]
+
+global_load_b96_tr_b6 v[2:4], v[6:7], off offset:-64
+// GFX1250: global_load_tr6_b96 v[2:4], v[6:7], off offset:-64 ; encoding: [0x7c,0x00,0x1d,0xee,0x02,0x00,0x00,0x00,0x06,0xc0,0xff,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index 0d61c1f..39de9a2 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
v_mov_b64_e32 v[4:5], v[2:3]
// GFX1250: v_mov_b64_e32 v[4:5], v[2:3] ; encoding: [0x02,0x3b,0x08,0x7e]
@@ -26,8 +26,10 @@ v_mov_b64 v[4:5], -1
v_mov_b64 v[4:5], 0.5
// GFX1250: v_mov_b64_e32 v[4:5], 0.5 ; encoding: [0xf0,0x3a,0x08,0x7e]
+// TODO: Encode as a 32-bit literal unless lit64() is specified.
v_mov_b64 v[254:255], 0xaf123456
-// GFX1250: v_mov_b64_e32 v[254:255], 0xaf123456 ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: v_mov_b64_e32 v[254:255], 0xaf123456 ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
v_tanh_f32 v5, v1
// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
index 02872b0..d9f6934 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
@@ -196,8 +196,9 @@ v_add_nc_u64 v[4:5], -4.0, v[4:5]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_add_nc_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_add_nc_u64 v[4:5], 0x3f717273, v[4:5]
// GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
@@ -316,8 +317,9 @@ v_sub_nc_u64 v[4:5], -4.0, v[4:5]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5]
// GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
@@ -436,8 +438,9 @@ v_mul_u64 v[4:5], -4.0, v[4:5]
// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_mul_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
v_mul_u64 v[4:5], 0x3f717273, v[4:5]
// GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index ad5771b..0548e9d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
s_alloc_vgpr 0x1235
// GFX12: s_alloc_vgpr 0x1235 ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
@@ -860,7 +860,8 @@ s_mov_b64 s[0:1], 0x3f717273
s_mov_b64 s[0:1], 0xaf123456
// GFX1200: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mov_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_mov_b64 s[0:1], null
// GFX12: s_mov_b64 s[0:1], null ; encoding: [0x7c,0x01,0x80,0xbe]
@@ -969,7 +970,8 @@ s_cmov_b64 s[0:1], 0x3f717273
s_cmov_b64 s[0:1], 0xaf123456
// GFX1200: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmov_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmov_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_not_b32 s0, s1
// GFX12: s_not_b32 s0, s1 ; encoding: [0x01,0x1e,0x80,0xbe]
@@ -1072,7 +1074,8 @@ s_not_b64 s[0:1], 0x3f717273
s_not_b64 s[0:1], 0xaf123456
// GFX1200: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_not_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_not_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_wqm_b32 s0, s1
// GFX12: s_wqm_b32 s0, s1 ; encoding: [0x01,0x1c,0x80,0xbe]
@@ -1175,7 +1178,8 @@ s_wqm_b64 s[0:1], 0x3f717273
s_wqm_b64 s[0:1], 0xaf123456
// GFX1200: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_wqm_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_wqm_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_brev_b32 s0, s1
// GFX12: s_brev_b32 s0, s1 ; encoding: [0x01,0x04,0x80,0xbe]
@@ -1278,7 +1282,8 @@ s_brev_b64 s[0:1], 0x3f717273
s_brev_b64 s[0:1], 0xaf123456
// GFX1200: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_brev_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_brev_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_bcnt0_i32_b32 s0, s1
// GFX12: s_bcnt0_i32_b32 s0, s1 ; encoding: [0x01,0x16,0x80,0xbe]
@@ -1390,7 +1395,8 @@ s_bcnt0_i32_b64 s0, 0x3f717273
s_bcnt0_i32_b64 s0, 0xaf123456
// GFX1200: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bcnt0_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bcnt0_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_bcnt1_i32_b32 s0, s1
// GFX12: s_bcnt1_i32_b32 s0, s1 ; encoding: [0x01,0x18,0x80,0xbe]
@@ -1502,7 +1508,8 @@ s_bcnt1_i32_b64 s0, 0x3f717273
s_bcnt1_i32_b64 s0, 0xaf123456
// GFX1200: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bcnt1_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bcnt1_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_ff1_i32_b32 s0, s1
// GFX12: s_ctz_i32_b32 s0, s1 ; encoding: [0x01,0x08,0x80,0xbe]
@@ -1614,7 +1621,8 @@ s_ff1_i32_b64 s0, 0x3f717273
s_ff1_i32_b64 s0, 0xaf123456
// GFX1200: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xff,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ctz_i32_b64 s0, 0xaf123456 ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ctz_i32_b64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_flbit_i32_b32 s0, s1
// GFX12: s_clz_i32_u32 s0, s1 ; encoding: [0x01,0x0a,0x80,0xbe]
@@ -1726,7 +1734,8 @@ s_flbit_i32_b64 s0, 0x3f717273
s_flbit_i32_b64 s0, 0xaf123456
// GFX1200: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xff,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_clz_i32_u64 s0, 0xaf123456 ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_clz_i32_u64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_flbit_i32 s0, s1
// GFX12: s_cls_i32 s0, s1 ; encoding: [0x01,0x0c,0x80,0xbe]
@@ -1838,7 +1847,8 @@ s_flbit_i32_i64 s0, 0x3f717273
s_flbit_i32_i64 s0, 0xaf123456
// GFX1200: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xff,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cls_i32_i64 s0, 0xaf123456 ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cls_i32_i64 s0, lit64(0xaf123456) ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_sext_i32_i8 s0, s1
// GFX12: s_sext_i32_i8 s0, s1 ; encoding: [0x01,0x0e,0x80,0xbe]
@@ -2284,7 +2294,8 @@ s_and_saveexec_b64 s[0:1], 0x3f717273
s_and_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_or_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x23,0x80,0xbe]
@@ -2324,7 +2335,8 @@ s_or_saveexec_b64 s[0:1], 0x3f717273
s_or_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xor_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_xor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x25,0x80,0xbe]
@@ -2364,7 +2376,8 @@ s_xor_saveexec_b64 s[0:1], 0x3f717273
s_xor_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_andn2_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_and_not1_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x31,0x80,0xbe]
@@ -2404,7 +2417,8 @@ s_andn2_saveexec_b64 s[0:1], 0x3f717273
s_andn2_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_orn2_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_or_not1_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x33,0x80,0xbe]
@@ -2444,7 +2458,8 @@ s_orn2_saveexec_b64 s[0:1], 0x3f717273
s_orn2_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nand_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_nand_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x27,0x80,0xbe]
@@ -2484,7 +2499,8 @@ s_nand_saveexec_b64 s[0:1], 0x3f717273
s_nand_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nor_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_nor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x29,0x80,0xbe]
@@ -2524,7 +2540,8 @@ s_nor_saveexec_b64 s[0:1], 0x3f717273
s_nor_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xnor_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_xnor_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x2b,0x80,0xbe]
@@ -2564,7 +2581,8 @@ s_xnor_saveexec_b64 s[0:1], 0x3f717273
s_xnor_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_quadmask_b32 s0, s1
// GFX12: s_quadmask_b32 s0, s1 ; encoding: [0x01,0x1a,0x80,0xbe]
@@ -2667,7 +2685,8 @@ s_quadmask_b64 s[0:1], 0x3f717273
s_quadmask_b64 s[0:1], 0xaf123456
// GFX1200: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_quadmask_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_quadmask_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_movrels_b32 s0, s1
// GFX12: s_movrels_b32 s0, s1 ; encoding: [0x01,0x40,0x80,0xbe]
@@ -2812,7 +2831,8 @@ s_movreld_b64 s[0:1], 0x3f717273
s_movreld_b64 s[0:1], 0xaf123456
// GFX1200: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_movreld_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_movreld_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_abs_i32 s0, s1
// GFX12: s_abs_i32 s0, s1 ; encoding: [0x01,0x15,0x80,0xbe]
@@ -2912,7 +2932,8 @@ s_andn1_saveexec_b64 s[0:1], 0x3f717273
s_andn1_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_orn1_saveexec_b64 s[0:1], s[2:3]
// GFX12: s_or_not0_saveexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x2f,0x80,0xbe]
@@ -2952,7 +2973,8 @@ s_orn1_saveexec_b64 s[0:1], 0x3f717273
s_orn1_saveexec_b64 s[0:1], 0xaf123456
// GFX1200: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_andn1_wrexec_b64 s[0:1], s[2:3]
// GFX12: s_and_not0_wrexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x35,0x80,0xbe]
@@ -2992,7 +3014,8 @@ s_andn1_wrexec_b64 s[0:1], 0x3f717273
s_andn1_wrexec_b64 s[0:1], 0xaf123456
// GFX1200: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_andn2_wrexec_b64 s[0:1], s[2:3]
// GFX12: s_and_not1_wrexec_b64 s[0:1], s[2:3] ; encoding: [0x02,0x37,0x80,0xbe]
@@ -3032,7 +3055,8 @@ s_andn2_wrexec_b64 s[0:1], 0x3f717273
s_andn2_wrexec_b64 s[0:1], 0xaf123456
// GFX1200: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_bitreplicate_b64_b32 s[0:1], s2
// GFX12: s_bitreplicate_b64_b32 s[0:1], s2 ; encoding: [0x02,0x14,0x80,0xbe]
@@ -3831,7 +3855,8 @@ s_ctz_i32_b64 exec_hi, src_scc
s_ctz_i32_b64 null, 0xaf123456
// GFX1200: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xff,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ctz_i32_b64 null, 0xaf123456 ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ctz_i32_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_not1_saveexec_b64 s[10:11], s[2:3]
// GFX12: s_and_not1_saveexec_b64 s[10:11], s[2:3] ; encoding: [0x02,0x31,0x8a,0xbe]
@@ -3859,7 +3884,8 @@ s_and_not1_saveexec_b64 ttmp[14:15], src_scc
s_and_not1_saveexec_b64 null, 0xaf123456
// GFX1200: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_not0_saveexec_b32 s5, s1
// GFX12: s_and_not0_saveexec_b32 s5, s1 ; encoding: [0x01,0x2c,0x85,0xbe]
@@ -3920,7 +3946,8 @@ s_and_not0_saveexec_b64 ttmp[14:15], src_scc
s_and_not0_saveexec_b64 null, 0xaf123456
// GFX1200: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_not0_wrexec_b32 s5, s1
// GFX12: s_and_not0_wrexec_b32 s5, s1 ; encoding: [0x01,0x34,0x85,0xbe]
@@ -3981,7 +4008,8 @@ s_and_not0_wrexec_b64 ttmp[14:15], src_scc
s_and_not0_wrexec_b64 null, 0xaf123456
// GFX1200: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xff,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_not1_saveexec_b32 s5, s1
// GFX12: s_and_not1_saveexec_b32 s5, s1 ; encoding: [0x01,0x30,0x85,0xbe]
@@ -4075,7 +4103,8 @@ s_and_not1_wrexec_b64 ttmp[14:15], src_scc
s_and_not1_wrexec_b64 null, 0xaf123456
// GFX1200: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xff,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_wrexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_cls_i32 s5, s1
// GFX12: s_cls_i32 s5, s1 ; encoding: [0x01,0x0c,0x85,0xbe]
@@ -4145,7 +4174,8 @@ s_cls_i32_i64 exec_hi, src_scc
s_cls_i32_i64 null, 0xaf123456
// GFX1200: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xff,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cls_i32_i64 null, 0xaf123456 ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cls_i32_i64 null, lit64(0xaf123456) ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_clz_i32_u32 s5, s1
// GFX12: s_clz_i32_u32 s5, s1 ; encoding: [0x01,0x0a,0x85,0xbe]
@@ -4215,7 +4245,8 @@ s_clz_i32_u64 exec_hi, src_scc
s_clz_i32_u64 null, 0xaf123456
// GFX1200: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xff,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_clz_i32_u64 null, 0xaf123456 ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_clz_i32_u64 null, lit64(0xaf123456) ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_not0_saveexec_b32 s5, s1
// GFX12: s_or_not0_saveexec_b32 s5, s1 ; encoding: [0x01,0x2e,0x85,0xbe]
@@ -4276,7 +4307,8 @@ s_or_not0_saveexec_b64 ttmp[14:15], src_scc
s_or_not0_saveexec_b64 null, 0xaf123456
// GFX1200: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_not1_saveexec_b32 s5, s1
// GFX12: s_or_not1_saveexec_b32 s5, s1 ; encoding: [0x01,0x32,0x85,0xbe]
@@ -4337,4 +4369,5 @@ s_or_not1_saveexec_b64 ttmp[14:15], src_scc
s_or_not1_saveexec_b64 null, 0xaf123456
// GFX1200: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
index 9c83879..3a24442 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s
s_add_nc_u64 s[0:1], s[2:3], s[4:5]
// GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xa9]
@@ -56,7 +56,8 @@ s_add_nc_u64 s[0:1], 0x3f717273, s[2:3]
s_add_nc_u64 s[0:1], 0xaf123456, s[2:3]
// GFX1200: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf]
-// GFX1250: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_add_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_add_nc_u64 s[0:1], s[2:3], exec
// GFX12: s_add_nc_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0xa9]
@@ -81,7 +82,8 @@ s_add_nc_u64 s[0:1], s[2:3], 0x3f717273
s_add_nc_u64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0xa9,0x56,0x34,0x12,0xaf]
-// GFX1250: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_add_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_sub_nc_u64 s[0:1], s[2:3], s[4:5]
// GFX12: s_sub_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x00,0xaa]
@@ -136,7 +138,8 @@ s_sub_nc_u64 s[0:1], 0x3f717273, s[2:3]
s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3]
// GFX1200: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_sub_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_sub_nc_u64 s[0:1], s[2:3], exec
// GFX12: s_sub_nc_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x00,0xaa]
@@ -161,7 +164,8 @@ s_sub_nc_u64 s[0:1], s[2:3], 0x3f717273
s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x00,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_sub_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_mul_u64 s[0:1], s[2:3], s[4:5]
// GFX12: s_mul_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xaa]
@@ -216,7 +220,8 @@ s_mul_u64 s[0:1], 0x3f717273, s[2:3]
s_mul_u64 s[0:1], 0xaf123456, s[2:3]
// GFX1200: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mul_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mul_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_mul_u64 s[0:1], s[2:3], exec
// GFX12: s_mul_u64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0xaa]
@@ -241,7 +246,8 @@ s_mul_u64 s[0:1], s[2:3], 0x3f717273
s_mul_u64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mul_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mul_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_add_f32 s5, s1, s2
// GFX12: s_add_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa0]
@@ -2359,7 +2365,8 @@ s_cselect_b64 s[0:1], 0x3f717273, s[4:5]
s_cselect_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x98,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cselect_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_cselect_b64 s[0:1], s[2:3], exec
// GFX12: s_cselect_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x98]
@@ -2384,7 +2391,8 @@ s_cselect_b64 s[0:1], s[2:3], 0x3f717273
s_cselect_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x98,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cselect_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_b32 s0, s1, s2
// GFX12: s_and_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8b]
@@ -2553,7 +2561,8 @@ s_and_b64 s[0:1], 0x3f717273, s[4:5]
s_and_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_b64 s[0:1], s[2:3], exec
// GFX12: s_and_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8b]
@@ -2578,7 +2587,8 @@ s_and_b64 s[0:1], s[2:3], 0x3f717273
s_and_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8b,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_b32 s0, s1, s2
// GFX12: s_or_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8c]
@@ -2738,7 +2748,8 @@ s_or_b64 s[0:1], 0x3f717273, s[4:5]
s_or_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_b64 s[0:1], s[2:3], exec
// GFX12: s_or_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8c]
@@ -2763,7 +2774,8 @@ s_or_b64 s[0:1], s[2:3], 0x3f717273
s_or_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8c,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xor_b32 s0, s1, s2
// GFX12: s_xor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8d]
@@ -2923,7 +2935,8 @@ s_xor_b64 s[0:1], 0x3f717273, s[4:5]
s_xor_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xor_b64 s[0:1], s[2:3], exec
// GFX12: s_xor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8d]
@@ -2948,7 +2961,8 @@ s_xor_b64 s[0:1], s[2:3], 0x3f717273
s_xor_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8d,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_andn2_b32 s0, s1, s2
// GFX12: s_and_not1_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x91]
@@ -3108,7 +3122,8 @@ s_andn2_b64 s[0:1], 0x3f717273, s[4:5]
s_andn2_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_andn2_b64 s[0:1], s[2:3], exec
// GFX12: s_and_not1_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x91]
@@ -3133,7 +3148,8 @@ s_andn2_b64 s[0:1], s[2:3], 0x3f717273
s_andn2_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_orn2_b32 s0, s1, s2
// GFX12: s_or_not1_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x92]
@@ -3293,7 +3309,8 @@ s_orn2_b64 s[0:1], 0x3f717273, s[4:5]
s_orn2_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_orn2_b64 s[0:1], s[2:3], exec
// GFX12: s_or_not1_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x92]
@@ -3318,7 +3335,8 @@ s_orn2_b64 s[0:1], s[2:3], 0x3f717273
s_orn2_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nand_b32 s0, s1, s2
// GFX12: s_nand_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8e]
@@ -3478,7 +3496,8 @@ s_nand_b64 s[0:1], 0x3f717273, s[4:5]
s_nand_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nand_b64 s[0:1], s[2:3], exec
// GFX12: s_nand_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8e]
@@ -3503,7 +3522,8 @@ s_nand_b64 s[0:1], s[2:3], 0x3f717273
s_nand_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8e,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nor_b32 s0, s1, s2
// GFX12: s_nor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x8f]
@@ -3663,7 +3683,8 @@ s_nor_b64 s[0:1], 0x3f717273, s[4:5]
s_nor_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_nor_b64 s[0:1], s[2:3], exec
// GFX12: s_nor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x8f]
@@ -3688,7 +3709,8 @@ s_nor_b64 s[0:1], s[2:3], 0x3f717273
s_nor_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x8f,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xnor_b32 s0, s1, s2
// GFX12: s_xnor_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x90]
@@ -3848,7 +3870,8 @@ s_xnor_b64 s[0:1], 0x3f717273, s[4:5]
s_xnor_b64 s[0:1], 0xaf123456, s[4:5]
// GFX1200: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x90,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_xnor_b64 s[0:1], s[2:3], exec
// GFX12: s_xnor_b64 s[0:1], s[2:3], exec ; encoding: [0x02,0x7e,0x80,0x90]
@@ -3873,7 +3896,8 @@ s_xnor_b64 s[0:1], s[2:3], 0x3f717273
s_xnor_b64 s[0:1], s[2:3], 0xaf123456
// GFX1200: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x90,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_lshl_b32 s0, s1, s2
// GFX12: s_lshl_b32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x84]
@@ -4033,7 +4057,8 @@ s_lshl_b64 s[0:1], 0x3f717273, s4
s_lshl_b64 s[0:1], 0xaf123456, s4
// GFX1200: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x84,0x56,0x34,0x12,0xaf]
-// GFX1250: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_lshl_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_lshl_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_lshl_b64 s[0:1], s[2:3], exec_lo
// GFX12: s_lshl_b64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x84]
@@ -4217,7 +4242,8 @@ s_lshr_b64 s[0:1], 0x3f717273, s4
s_lshr_b64 s[0:1], 0xaf123456, s4
// GFX1200: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x85,0x56,0x34,0x12,0xaf]
-// GFX1250: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_lshr_b64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_lshr_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_lshr_b64 s[0:1], s[2:3], exec_lo
// GFX12: s_lshr_b64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x85]
@@ -4401,7 +4427,8 @@ s_ashr_i64 s[0:1], 0x3f717273, s4
s_ashr_i64 s[0:1], 0xaf123456, s4
// GFX1200: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x86,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ashr_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ashr_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_ashr_i64 s[0:1], s[2:3], exec_lo
// GFX12: s_ashr_i64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x86]
@@ -4996,7 +5023,8 @@ s_bfe_u64 s[0:1], 0x3f717273, s4
s_bfe_u64 s[0:1], 0xaf123456, s4
// GFX1200: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x00,0x94,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bfe_u64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bfe_u64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_bfe_u64 s[0:1], s[2:3], exec_lo
// GFX12: s_bfe_u64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x00,0x94]
@@ -5075,7 +5103,8 @@ s_bfe_i64 s[0:1], 0x3f717273, s4
s_bfe_i64 s[0:1], 0xaf123456, s4
// GFX1200: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xff,0x04,0x80,0x94,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bfe_i64 s[0:1], 0xaf123456, s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bfe_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_bfe_i64 s[0:1], s[2:3], exec_lo
// GFX12: s_bfe_i64 s[0:1], s[2:3], exec_lo ; encoding: [0x02,0x7e,0x80,0x94]
@@ -6279,7 +6308,8 @@ s_and_not1_b64 s[10:11], vcc, ttmp[14:15]
s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456
// GFX1200: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_and_not1_b64 s[10:11], exec, src_scc
// GFX12: s_and_not1_b64 s[10:11], exec, src_scc ; encoding: [0x7e,0xfd,0x8a,0x91]
@@ -6298,7 +6328,8 @@ s_and_not1_b64 exec, src_scc, exec
s_and_not1_b64 null, 0xaf123456, vcc
// GFX1200: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xff,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_not1_b64 s[10:11], s[2:3], s[4:5]
// GFX12: s_or_not1_b64 s[10:11], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x8a,0x92]
@@ -6311,7 +6342,8 @@ s_or_not1_b64 s[10:11], vcc, ttmp[14:15]
s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456
// GFX1200: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_or_not1_b64 s[10:11], exec, src_scc
// GFX12: s_or_not1_b64 s[10:11], exec, src_scc ; encoding: [0x7e,0xfd,0x8a,0x92]
@@ -6330,4 +6362,5 @@ s_or_not1_b64 exec, src_scc, exec
s_or_not1_b64 null, 0xaf123456, vcc
// GFX1200: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xff,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 null, 0xaf123456, vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
index 98bb3c3..8056cef 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s
s_cmp_lt_f32 s1, s2
// GFX12: s_cmp_lt_f32 s1, s2 ; encoding: [0x01,0x02,0x41,0xbf]
@@ -2120,7 +2120,8 @@ s_cmp_eq_u64 s[0:1], 0x3f717273
s_cmp_eq_u64 s[0:1], 0xaf123456
// GFX1200: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmp_eq_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmp_eq_u64 s[0:1], lit64(0xaf123456) ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
s_cmp_lg_u64 s[0:1], s[2:3]
// GFX12: s_cmp_lg_u64 s[0:1], s[2:3] ; encoding: [0x00,0x02,0x11,0xbf]
@@ -2163,4 +2164,5 @@ s_cmp_lg_u64 s[0:1], 0x3f717273
s_cmp_lg_u64 s[0:1], 0xaf123456
// GFX1200: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xff,0x11,0xbf,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmp_lg_u64 s[0:1], 0xaf123456 ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmp_lg_u64 s[0:1], lit64(0xaf123456) ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/lit.local.cfg b/llvm/test/MC/AMDGPU/lit.local.cfg
index c5853ad..12a5c8a 100644
--- a/llvm/test/MC/AMDGPU/lit.local.cfg
+++ b/llvm/test/MC/AMDGPU/lit.local.cfg
@@ -1,4 +1,4 @@
-config.substitutions.append(("%extract-encodings", "sed 's/.*encoding://p'"))
+config.substitutions.append(("%extract-encodings", "sed -n 's/.*encoding://p'"))
if not "AMDGPU" in config.root.targets:
config.unsupported = True
diff --git a/llvm/test/MC/AMDGPU/offset-expr.s b/llvm/test/MC/AMDGPU/offset-expr.s
index 92a9bf1b..7c3c71c 100644
--- a/llvm/test/MC/AMDGPU/offset-expr.s
+++ b/llvm/test/MC/AMDGPU/offset-expr.s
@@ -9,10 +9,10 @@ BB1:
v_nop_e64
BB2:
s_add_u32 vcc_lo, vcc_lo, (BB2-BB1)&4294967295
-// CHECK: s_add_u32 vcc_lo, vcc_lo, 8 // 000000000018: 806AFF6A 00000008
+// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0x8) // 000000000018: 806AFF6A 00000008
s_addc_u32 vcc_hi, vcc_hi, (BB2-BB1)>>32
-// CHECK: s_addc_u32 vcc_hi, vcc_hi, 0 // 000000000020: 826BFF6B 00000000
+// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0x0) // 000000000020: 826BFF6B 00000000
s_add_u32 vcc_lo, vcc_lo, (BB0-BB1)&4294967295
-// CHECK: s_add_u32 vcc_lo, vcc_lo, -16 // 000000000028: 806AFF6A FFFFFFF0
+// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0xfffffff0) // 000000000028: 806AFF6A FFFFFFF0
s_addc_u32 vcc_hi, vcc_hi, (BB0-BB1)>>32
-// CHECK: s_addc_u32 vcc_hi, vcc_hi, -1 // 000000000030: 826BFF6B FFFFFFFF
+// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0xffffffff) // 000000000030: 826BFF6B FFFFFFFF
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
index d2da087..856d7c2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
@@ -40,8 +40,7 @@
# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00]
0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01
-# FIXME: This should be able to round trip with literal after instruction
-# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
+# VI: v_add_f16_e32 v1, lit(0x0), v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x00,0x00,0x00]
0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
# VI: v_add_f16_e32 v1, 0xffcd, v3 ; encoding: [0xff,0x06,0x02,0x3e,0xcd,0xff,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt
new file mode 100644
index 0000000..d4888ad
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3cx_nowarn.txt
@@ -0,0 +1,422 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck -strict-whitespace %s
+
+# In GFX10+, v_cmpx_* use EXEC as the implicit dst. The disassembler issues a warning when the dst
+# is not 0x7e (EXEC). In GFX9 and earlier, these instructions have explicit dst. Therefore, such
+# warnings should not be issued.
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 tba, v1, v2 ; encoding: [0x6c,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 tma, v1, v2 ; encoding: [0x6e,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f32_e64 ttmp[10:11], v1, v2 ; encoding: [0x7a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0x11,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f64_e64 s[0:1], v[1:2], v2 ; encoding: [0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00]
+0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f16_e64 s[2:3], v1, v2 ; encoding: [0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00]
+0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f16_e64 s[4:5], v1, v2 ; encoding: [0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00]
+0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f16_e64 s[6:7], v1, v2 ; encoding: [0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00]
+0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f16_e64 s[8:9], v1, v2 ; encoding: [0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00]
+0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f16_e64 s[12:13], v1, v2 ; encoding: [0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00]
+0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f16_e64 s[14:15], v1, v2 ; encoding: [0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00]
+0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f16_e64 s[16:17], v1, v2 ; encoding: [0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00]
+0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f16_e64 s[18:19], v1, v2 ; encoding: [0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00]
+0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f16_e64 s[20:21], v1, v2 ; encoding: [0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00]
+0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f16_e64 s[22:23], v1, v2 ; encoding: [0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00]
+0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f16_e64 s[24:25], v1, v2 ; encoding: [0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00]
+0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f16_e64 s[26:27], v1, v2 ; encoding: [0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00]
+0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f16_e64 s[28:29], v1, v2 ; encoding: [0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00]
+0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f16_e64 s[30:31], v1, v2 ; encoding: [0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00]
+0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f16_e64 s[32:33], v1, v2 ; encoding: [0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00]
+0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f16_e64 s[34:35], v1, v2 ; encoding: [0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00]
+0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f32_e64 s[36:37], v1, v2 ; encoding: [0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00]
+0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f32_e64 s[38:39], v1, v2 ; encoding: [0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00]
+0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f32_e64 s[40:41], v1, v2 ; encoding: [0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00]
+0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f32_e64 s[42:43], v1, v2 ; encoding: [0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00]
+0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f32_e64 s[44:45], v1, v2 ; encoding: [0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00]
+0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f32_e64 s[46:47], v1, v2 ; encoding: [0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00]
+0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f32_e64 s[48:49], v1, v2 ; encoding: [0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00]
+0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f32_e64 s[50:51], v1, v2 ; encoding: [0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00]
+0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f32_e64 s[52:53], v1, v2 ; encoding: [0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00]
+0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f32_e64 s[54:55], v1, v2 ; encoding: [0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00]
+0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f32_e64 s[56:57], v1, v2 ; encoding: [0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00]
+0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f32_e64 s[58:59], v1, v2 ; encoding: [0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00]
+0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f32_e64 s[60:61], v1, v2 ; encoding: [0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00]
+0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f32_e64 s[62:63], v1, v2 ; encoding: [0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00]
+0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f32_e64 s[64:65], v1, v2 ; encoding: [0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00]
+0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f32_e64 s[66:67], v1, v2 ; encoding: [0x42,0x00,0x5f,0xd0,0x01,0x05,0x02,0x00]
+0x42,0x00,0x5f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f64_e64 s[68:69], v[1:2], v[2:3] ; encoding: [0x44,0x00,0x70,0xd0,0x01,0x05,0x02,0x00]
+0x44,0x00,0x70,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f64_e64 s[70:71], v[1:2], v[2:3] ; encoding: [0x46,0x00,0x71,0xd0,0x01,0x05,0x02,0x00]
+0x46,0x00,0x71,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f64_e64 s[72:73], v[1:2], v[2:3] ; encoding: [0x48,0x00,0x72,0xd0,0x01,0x05,0x02,0x00]
+0x48,0x00,0x72,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f64_e64 s[74:75], v[1:2], v[2:3] ; encoding: [0x4a,0x00,0x73,0xd0,0x01,0x05,0x02,0x00]
+0x4a,0x00,0x73,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f64_e64 s[76:77], v[1:2], v[2:3] ; encoding: [0x4c,0x00,0x74,0xd0,0x01,0x05,0x02,0x00]
+0x4c,0x00,0x74,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f64_e64 s[78:79], v[1:2], v[2:3] ; encoding: [0x4e,0x00,0x75,0xd0,0x01,0x05,0x02,0x00]
+0x4e,0x00,0x75,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f64_e64 s[80:81], v[1:2], v[2:3] ; encoding: [0x50,0x00,0x76,0xd0,0x01,0x05,0x02,0x00]
+0x50,0x00,0x76,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f64_e64 s[82:83], v[1:2], v[2:3] ; encoding: [0x52,0x00,0x77,0xd0,0x01,0x05,0x02,0x00]
+0x52,0x00,0x77,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f64_e64 s[84:85], v[1:2], v[2:3] ; encoding: [0x54,0x00,0x78,0xd0,0x01,0x05,0x02,0x00]
+0x54,0x00,0x78,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f64_e64 s[86:87], v[1:2], v[2:3] ; encoding: [0x56,0x00,0x79,0xd0,0x01,0x05,0x02,0x00]
+0x56,0x00,0x79,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f64_e64 s[88:89], v[1:2], v[2:3] ; encoding: [0x58,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00]
+0x58,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f64_e64 s[90:91], v[1:2], v[2:3] ; encoding: [0x5a,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00]
+0x5a,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f64_e64 s[92:93], v[1:2], v[2:3] ; encoding: [0x5c,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00]
+0x5c,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f64_e64 s[94:95], v[1:2], v[2:3] ; encoding: [0x5e,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00]
+0x5e,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f64_e64 s[96:97], v[1:2], v[2:3] ; encoding: [0x60,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00]
+0x60,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f64_e64 s[98:99], v[1:2], v[2:3] ; encoding: [0x62,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00]
+0x62,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i16_e64 s[100:101], v1, v2 ; encoding: [0x64,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00]
+0x64,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i16_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i16_e64 xnack_mask, v1, v2 ; encoding: [0x68,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00]
+0x68,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i16_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i16_e64 tba, v1, v2 ; encoding: [0x6c,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i16_e64 tma, v1, v2 ; encoding: [0x6e,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i16_e64 ttmp[0:1], v1, v2 ; encoding: [0x70,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00]
+0x70,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i16_e64 ttmp[2:3], v1, v2 ; encoding: [0x72,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00]
+0x72,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u16_e64 ttmp[4:5], v1, v2 ; encoding: [0x74,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00]
+0x74,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u16_e64 ttmp[6:7], v1, v2 ; encoding: [0x76,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00]
+0x76,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u16_e64 ttmp[8:9], v1, v2 ; encoding: [0x78,0x00,0xba,0xd0,0x01,0x05,0x02,0x00]
+0x78,0x00,0xba,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u16_e64 ttmp[10:11], v1, v2 ; encoding: [0x7a,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u16_e64 exec, v1, v2 ; encoding: [0x7e,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00]
+0x7e,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt
new file mode 100644
index 0000000..0c4f107
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3c_nowarn.txt
@@ -0,0 +1,402 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s 2>&1 | FileCheck -strict-whitespace %s
+
+# In GFX10+, v_cmpx_* instructions use EXEC as the implicit dst. The disassembler issues a warning
+# when the dst is not 0x7e (EXEC). In GFX9 and earlier, these instructions have an explicit dst, so
+# such warnings should not be issued.
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmp_class_f32_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0x10,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f64_e64 s[0:1], v[1:2], v2 ; encoding: [0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00]
+0x00,0x00,0x13,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_class_f16_e64 s[2:3], v1, v2 ; encoding: [0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00]
+0x02,0x00,0x15,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f16_e64 s[4:5], v1, v2 ; encoding: [0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00]
+0x04,0x00,0x30,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f16_e64 s[6:7], v1, v2 ; encoding: [0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00]
+0x06,0x00,0x31,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f16_e64 s[8:9], v1, v2 ; encoding: [0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00]
+0x08,0x00,0x32,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0x33,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f16_e64 s[12:13], v1, v2 ; encoding: [0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00]
+0x0c,0x00,0x34,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f16_e64 s[14:15], v1, v2 ; encoding: [0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00]
+0x0e,0x00,0x35,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f16_e64 s[16:17], v1, v2 ; encoding: [0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00]
+0x10,0x00,0x36,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f16_e64 s[18:19], v1, v2 ; encoding: [0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00]
+0x12,0x00,0x37,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f16_e64 s[20:21], v1, v2 ; encoding: [0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00]
+0x14,0x00,0x38,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f16_e64 s[22:23], v1, v2 ; encoding: [0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00]
+0x16,0x00,0x39,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f16_e64 s[24:25], v1, v2 ; encoding: [0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00]
+0x18,0x00,0x3a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f16_e64 s[26:27], v1, v2 ; encoding: [0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00]
+0x1a,0x00,0x3b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f16_e64 s[28:29], v1, v2 ; encoding: [0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00]
+0x1c,0x00,0x3c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f16_e64 s[30:31], v1, v2 ; encoding: [0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00]
+0x1e,0x00,0x3d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f16_e64 s[32:33], v1, v2 ; encoding: [0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00]
+0x20,0x00,0x3e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f16_e64 s[34:35], v1, v2 ; encoding: [0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00]
+0x22,0x00,0x3f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f32_e64 s[36:37], v1, v2 ; encoding: [0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00]
+0x24,0x00,0x50,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_f32_e64 s[38:39], v1, v2 ; encoding: [0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00]
+0x26,0x00,0x51,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f32_e64 s[40:41], v1, v2 ; encoding: [0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00]
+0x28,0x00,0x52,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f32_e64 s[42:43], v1, v2 ; encoding: [0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00]
+0x2a,0x00,0x53,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f32_e64 s[44:45], v1, v2 ; encoding: [0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00]
+0x2c,0x00,0x54,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f32_e64 s[46:47], v1, v2 ; encoding: [0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00]
+0x2e,0x00,0x55,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f32_e64 s[48:49], v1, v2 ; encoding: [0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00]
+0x30,0x00,0x56,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f32_e64 s[50:51], v1, v2 ; encoding: [0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00]
+0x32,0x00,0x57,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f32_e64 s[52:53], v1, v2 ; encoding: [0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00]
+0x34,0x00,0x58,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f32_e64 s[54:55], v1, v2 ; encoding: [0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00]
+0x36,0x00,0x59,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f32_e64 s[56:57], v1, v2 ; encoding: [0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00]
+0x38,0x00,0x5a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f32_e64 s[58:59], v1, v2 ; encoding: [0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00]
+0x3a,0x00,0x5b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f32_e64 s[60:61], v1, v2 ; encoding: [0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00]
+0x3c,0x00,0x5c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f32_e64 s[62:63], v1, v2 ; encoding: [0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00]
+0x3e,0x00,0x5d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f32_e64 s[64:65], v1, v2 ; encoding: [0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00]
+0x40,0x00,0x5e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_f64_e64 s[66:67], v[1:2], v[2:3] ; encoding: [0x42,0x00,0x70,0xd0,0x01,0x05,0x02,0x00]
+0x42,0x00,0x70,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_f64_e64 s[68:69], v[1:2], v[2:3] ; encoding: [0x44,0x00,0x72,0xd0,0x01,0x05,0x02,0x00]
+0x44,0x00,0x72,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_f64_e64 s[70:71], v[1:2], v[2:3] ; encoding: [0x46,0x00,0x73,0xd0,0x01,0x05,0x02,0x00]
+0x46,0x00,0x73,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_f64_e64 s[72:73], v[1:2], v[2:3] ; encoding: [0x48,0x00,0x74,0xd0,0x01,0x05,0x02,0x00]
+0x48,0x00,0x74,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lg_f64_e64 s[74:75], v[1:2], v[2:3] ; encoding: [0x4a,0x00,0x75,0xd0,0x01,0x05,0x02,0x00]
+0x4a,0x00,0x75,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_f64_e64 s[76:77], v[1:2], v[2:3] ; encoding: [0x4c,0x00,0x76,0xd0,0x01,0x05,0x02,0x00]
+0x4c,0x00,0x76,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_o_f64_e64 s[78:79], v[1:2], v[2:3] ; encoding: [0x4e,0x00,0x77,0xd0,0x01,0x05,0x02,0x00]
+0x4e,0x00,0x77,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_u_f64_e64 s[80:81], v[1:2], v[2:3] ; encoding: [0x50,0x00,0x78,0xd0,0x01,0x05,0x02,0x00]
+0x50,0x00,0x78,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nge_f64_e64 s[82:83], v[1:2], v[2:3] ; encoding: [0x52,0x00,0x79,0xd0,0x01,0x05,0x02,0x00]
+0x52,0x00,0x79,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlg_f64_e64 s[84:85], v[1:2], v[2:3] ; encoding: [0x54,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00]
+0x54,0x00,0x7a,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ngt_f64_e64 s[86:87], v[1:2], v[2:3] ; encoding: [0x56,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00]
+0x56,0x00,0x7b,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nle_f64_e64 s[88:89], v[1:2], v[2:3] ; encoding: [0x58,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00]
+0x58,0x00,0x7c,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_neq_f64_e64 s[90:91], v[1:2], v[2:3] ; encoding: [0x5a,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00]
+0x5a,0x00,0x7d,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_nlt_f64_e64 s[92:93], v[1:2], v[2:3] ; encoding: [0x5c,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00]
+0x5c,0x00,0x7e,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_tru_f64_e64 s[94:95], v[1:2], v[2:3] ; encoding: [0x5e,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00]
+0x5e,0x00,0x7f,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i16_e64 s[96:97], v1, v2 ; encoding: [0x60,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00]
+0x60,0x00,0xb0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i16_e64 s[98:99], v1, v2 ; encoding: [0x62,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00]
+0x62,0x00,0xb1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i16_e64 s[100:101], v1, v2 ; encoding: [0x64,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00]
+0x64,0x00,0xb2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i16_e64 flat_scratch, v1, v2 ; encoding: [0x66,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00]
+0x66,0x00,0xb3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i16_e64 xnack_mask, v1, v2 ; encoding: [0x68,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00]
+0x68,0x00,0xb4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i16_e64 vcc, v1, v2 ; encoding: [0x6a,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00]
+0x6a,0x00,0xb5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i16_e64 ttmp[0:1], v1, v2 ; encoding: [0x6c,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00]
+0x6c,0x00,0xb6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i16_e64 ttmp[2:3], v1, v2 ; encoding: [0x6e,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00]
+0x6e,0x00,0xb7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u16_e64 ttmp[4:5], v1, v2 ; encoding: [0x70,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00]
+0x70,0x00,0xb8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u16_e64 ttmp[6:7], v1, v2 ; encoding: [0x72,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00]
+0x72,0x00,0xb9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u16_e64 ttmp[8:9], v1, v2 ; encoding: [0x74,0x00,0xba,0xd0,0x01,0x05,0x02,0x00]
+0x74,0x00,0xba,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u16_e64 ttmp[10:11], v1, v2 ; encoding: [0x76,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00]
+0x76,0x00,0xbb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u16_e64 ttmp[12:13], v1, v2 ; encoding: [0x78,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00]
+0x78,0x00,0xbc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u16_e64 ttmp[14:15], v1, v2 ; encoding: [0x7a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00]
+0x7a,0x00,0xbd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u16_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xbf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xd9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xda,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xde,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u32_e64 s[10:11], v1, v2 ; encoding: [0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xdf,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf0,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf1,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf2,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf3,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf4,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf5,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf6,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_i64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf7,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_f_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf8,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_lt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xf9,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_eq_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfa,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_le_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfb,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_gt_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfc,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ne_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfd,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_ge_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xfe,0xd0,0x01,0x05,0x02,0x00
+
+# CHECK-NOT: [[@LINE+2]]:1: warning: potentially undefined instruction encoding
+# CHECK: v_cmpx_t_u64_e64 s[10:11], v[1:2], v[2:3] ; encoding: [0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00]
+0x0a,0x00,0xff,0xd0,0x01,0x05,0x02,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/literals.txt b/llvm/test/MC/Disassembler/AMDGPU/literals.txt
new file mode 100644
index 0000000..bd013a1
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/literals.txt
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_bf16_e32 v127.l, lit(0x1) ; encoding: [0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00]
+
+0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_add_bf16 v255, lit(0x1), vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00]
+
+0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_f16_e32 v127.l, lit(0x1) ; encoding: [0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00]
+
+0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_fmac_f16 v255, lit(0x1), v255 ; encoding: [0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00]
+
+# The immediate is always literal in this instruction.
+0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00
+# GFX1250: v_cvt_pk_bf8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
+
+0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_cvt_pk_f16_bf8 v1, lit(0x1) ; encoding: [0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00]
+
+0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_add_min_i16 v10, lit(0x1), v2, v3 ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00]
+
+0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_f32_e32 v255, lit(0x1) ; encoding: [0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00]
+
+0xff,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00
+# GFX1250: v_mov_b64_e32 v[254:255], lit(0x1) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
index 4ec534f..fa40fe6 100644
--- a/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/pushp-popp.txt
@@ -17,6 +17,10 @@
# INTEL: pushp r16
0xd5,0x18,0x50
+# ATT: pushq %r16
+# INTEL: push r16
+0xd5,0x10,0x50
+
# ATT: popp %rax
# INTEL: popp rax
0xd5,0x08,0x58
@@ -32,3 +36,7 @@
# ATT: popp %r16
# INTEL: popp r16
0xd5,0x18,0x58
+
+# ATT: popq %r16
+# INTEL: pop r16
+0xd5,0x10,0x58
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4b..57da338 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -917,11 +917,11 @@ main:
# CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02]
f16x8.nearest
- # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02]
- f16x8.relaxed_madd
+ # CHECK: f16x8.madd # encoding: [0xfd,0xce,0x02]
+ f16x8.madd
- # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02]
- f16x8.relaxed_nmadd
+ # CHECK: f16x8.nmadd # encoding: [0xfd,0xcf,0x02]
+ f16x8.nmadd
# CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02]
i16x8.trunc_sat_f16x8_s
diff --git a/llvm/test/MC/X86/apx/pushp-popp-att.s b/llvm/test/MC/X86/apx/pushp-popp-att.s
index a810744..d638034 100644
--- a/llvm/test/MC/X86/apx/pushp-popp-att.s
+++ b/llvm/test/MC/X86/apx/pushp-popp-att.s
@@ -1,7 +1,7 @@
# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
-# ERROR-COUNT-8: error:
+# ERROR-COUNT-10: error:
# ERROR-NOT: error:
# CHECK: pushp %rax
@@ -16,6 +16,9 @@
# CHECK: pushp %r16
# CHECK: encoding: [0xd5,0x18,0x50]
pushp %r16
+# CHECK: pushq %r16
+# CHECK: encoding: [0xd5,0x10,0x50]
+ pushq %r16
# CHECK: popp %rax
# CHECK: encoding: [0xd5,0x08,0x58]
@@ -29,3 +32,6 @@
# CHECK: popp %r16
# CHECK: encoding: [0xd5,0x18,0x58]
popp %r16
+# CHECK: popq %r16
+# CHECK: encoding: [0xd5,0x10,0x58]
+ popq %r16
diff --git a/llvm/test/MC/X86/verify-callgraph-section.s b/llvm/test/MC/X86/verify-callgraph-section.s
index ce07228..9be5a68 100644
--- a/llvm/test/MC/X86/verify-callgraph-section.s
+++ b/llvm/test/MC/X86/verify-callgraph-section.s
@@ -2,7 +2,7 @@
/// (annotated by generated temporary labels .Ltmp*) are associated
/// with the corresponding callee type identifiers.
-// RUN: llvm-mc -triple=x86_64 -filetype=obj -o - < %s | llvm-readelf -x .callgraph - | FileCheck %s
+// RUN: llvm-mc -triple=x86_64 -filetype=obj -o - < %s | llvm-readelf -x .llvm.callgraph - | FileCheck %s
.text
.globl ball # -- Begin function ball
@@ -38,7 +38,7 @@ ball: # @ball
addq $32, %rsp
popq %rbx
retq
- .section .callgraph,"o",@progbits,.text
+ .section .llvm.callgraph,"o",@progbits,.text
.quad 0
.quad .Lfunc_begin0
.quad 1
diff --git a/llvm/test/Other/debugcounter-dce.ll b/llvm/test/Other/debugcounter-dce.ll
index 54d929f..3b1dfb4 100644
--- a/llvm/test/Other/debugcounter-dce.ll
+++ b/llvm/test/Other/debugcounter-dce.ll
@@ -1,8 +1,16 @@
; REQUIRES: asserts
-; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s --check-prefixes=CHECK,NO-PRINT
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 -print-debug-counter-queries < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
;; Test that, with debug counters on, we will skip the first DCE opportunity, perform next 2,
;; and ignore all the others left.
+; NO-PRINT-NOT: DebugCounter
+; PRINT: DebugCounter dce-transform=0 skip
+; PRINT-NEXT: DebugCounter dce-transform=1 execute
+; PRINT-NEXT: DebugCounter dce-transform=2 execute
+; PRINT-NEXT: DebugCounter dce-transform=3 skip
+; PRINT-NEXT: DebugCounter dce-transform=4 skip
+
; CHECK-LABEL: @test
; CHECK-NEXT: %add1 = add i32 1, 2
; CHECK-NEXT: %sub1 = sub i32 %add1, 1
diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 3eda077..475faf9 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -177,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
// CHECK-NEXT: static constexpr bool is_iterable = true;
// CHECK-NEXT: };
// CHECK-NEXT: } // namespace llvm
+// CHECK-EMPTY:
// CHECK-NEXT: #endif // LLVM_Tdl_INC
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index a25197c..ccc0944 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -150,6 +150,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
// CHECK-NEXT: static constexpr bool is_iterable = true;
// CHECK-NEXT: };
// CHECK-NEXT: } // namespace llvm
+// CHECK-EMPTY:
// CHECK-NEXT: #endif // LLVM_Tdl_INC
// IMPL: #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
diff --git a/llvm/test/TableGen/listsplat.td b/llvm/test/TableGen/listsplat.td
index 5a93a4c..43803d6 100644
--- a/llvm/test/TableGen/listsplat.td
+++ b/llvm/test/TableGen/listsplat.td
@@ -1,4 +1,5 @@
// RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
// CHECK: ------------- Classes -----------------
// CHECK-NEXT: class X<int X:a = ?, int X:b = ?> {
@@ -73,3 +74,8 @@ def DYa1 : Y<"a", 1>;
def DYa2 : Y<"a", 2>;
def DZ : X<42, !size([1, 2, 3])>;
+
+#ifdef ERROR1
+// ERROR1: !listsplat count -1 is negative
+defvar E = !listsplat("", -1);
+#endif
diff --git a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
index 7a4d860..fe2a002 100644
--- a/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
+++ b/llvm/test/ThinLTO/X86/memprof-supports-hot-cold-new.ll
@@ -17,11 +17,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR
+
; DUMP: Callsite Context Graph:
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s --check-prefix=IR
; IR: @main()
; IR: !memprof {{.*}} !callsite
; IR: @_Znam(i64 0) #[[ATTR:[0-9]+]]
@@ -41,13 +42,12 @@
; RUN: -r=%t/foo.o,foo,plx \
; RUN: -r=%t/foo.o,_Znam, \
; RUN: -memprof-dump-ccg \
-; RUN: -save-temps \
+; RUN: -print-before=memprof-context-disambiguation \
+; RUN: -thinlto-threads=1 \
; RUN: -o %t.out 2>&1 | FileCheck %s --allow-empty \
-; RUN: --implicit-check-not "Callsite Context Graph:"
-
-; RUN: llvm-dis %t.out.1.3.import.bc -o - | FileCheck %s \
-; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
-; RUN: --implicit-check-not "memprof"="cold"
+; RUN: --implicit-check-not "Callsite Context Graph:" \
+; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \
+; RUN: --implicit-check-not "memprof"="cold"
;--- main.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll
index fdab67a..afc98ce 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-constant-propagation.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=dfa-jump-threading,sccp,simplifycfg %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading,sccp,simplifycfg -verify-dom-info=1 %s | FileCheck %s
; This test checks that a constant propagation is applied for a basic loop.
; Related to bug 44679.
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
index f45798b..5076517 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-analysis.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: opt -S -passes=dfa-jump-threading -debug-only=dfa-jump-threading -disable-output %s 2>&1 | FileCheck %s
-; RUN: opt -S -passes=dfa-jump-threading -print-prof-data %s -o - | FileCheck %s --check-prefix=PROFILE
+; RUN: opt -S -passes=dfa-jump-threading -verify-dom-info=1 -debug-only=dfa-jump-threading -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -verify-dom-info=1 -print-prof-data %s -o - | FileCheck %s --check-prefix=PROFILE
; This test checks that the analysis identifies all threadable paths in a
; simple CFG. A threadable path includes a list of basic blocks, the exit
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll
index 092c854..426b51e 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -verify-dom-info=1 %s | FileCheck %s
; These tests check that the DFA jump threading transformation is applied
; properly to two CFGs. It checks that blocks are cloned, branches are updated,
@@ -445,9 +445,67 @@ bb2: ; preds = %select.unfold
unreachable
}
+
+define i16 @DTU_update_crash() {
+; CHECK-LABEL: @DTU_update_crash(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY_SELECTBLOCK:%.*]]
+; CHECK: for.body.selectblock:
+; CHECK-NEXT: br i1 false, label [[SWITCHBLOCK_JT0:%.*]], label [[SEL_SI_UNFOLD_FALSE_JT0:%.*]]
+; CHECK: sel.si.unfold.false:
+; CHECK-NEXT: br label [[SWITCHBLOCK:%.*]]
+; CHECK: sel.si.unfold.false.jt0:
+; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[FOR_BODY_SELECTBLOCK]] ]
+; CHECK-NEXT: br label [[SWITCHBLOCK_JT0]]
+; CHECK: switchblock:
+; CHECK-NEXT: [[SWITCHBLOCK_PHI:%.*]] = phi i32 [ poison, [[SEL_SI_UNFOLD_FALSE:%.*]] ]
+; CHECK-NEXT: [[P_24_ADDR_3:%.*]] = phi i32 [ 0, [[SEL_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT: switch i32 [[SWITCHBLOCK_PHI]], label [[CLEANUP:%.*]] [
+; CHECK-NEXT: i32 0, label [[FOR_INC:%.*]]
+; CHECK-NEXT: i32 1, label [[CLEANUP]]
+; CHECK-NEXT: i32 5, label [[FOR_BODY_SELECTBLOCK]]
+; CHECK-NEXT: ]
+; CHECK: switchblock.jt0:
+; CHECK-NEXT: [[SWITCHBLOCK_PHI_JT0:%.*]] = phi i32 [ 0, [[FOR_BODY_SELECTBLOCK]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SEL_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT: [[P_24_ADDR_3_JT0:%.*]] = phi i32 [ 0, [[FOR_BODY_SELECTBLOCK]] ], [ 0, [[SEL_SI_UNFOLD_FALSE_JT0]] ]
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: br i1 false, label [[FOR_BODY_SELECTBLOCK]], label [[CLEANUP]]
+; CHECK: cleanup:
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[P_24_ADDR_3_JT0]])
+; CHECK-NEXT: ret i16 0
+;
+entry:
+ br label %for.body.selectblock
+
+for.body.selectblock: ; preds = %for.inc, %switchblock, %entry
+ %sel = select i1 false, i32 0, i32 0
+ br label %switchblock
+
+switchblock: ; preds = %for.body.selectblock
+ %switchblock.phi = phi i32 [ %sel, %for.body.selectblock ]
+ %p_24.addr.3 = phi i32 [ 0, %for.body.selectblock ]
+ switch i32 %switchblock.phi, label %cleanup [
+ i32 0, label %for.inc
+ i32 1, label %cleanup
+ i32 5, label %for.body.selectblock
+ ]
+
+for.inc: ; preds = %switchblock
+ br i1 false, label %for.body.selectblock, label %cleanup
+
+cleanup: ; preds = %for.inc, %switchblock, %switchblock
+ call void (...) @llvm.fake.use(i32 %p_24.addr.3)
+ ret i16 0
+}
+
+declare void @llvm.fake.use(...)
+
!0 = !{!"function_entry_count", i32 10}
!1 = !{!"branch_weights", i32 3, i32 5}
;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;.
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 3, i32 5}
;.
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index de38752..95d3ffa 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=dfa-jump-threading -dfa-early-exit-heuristic=false %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -dfa-early-exit-heuristic=false -verify-dom-info=1 %s | FileCheck %s
; These tests check if selects are unfolded properly for jump threading
; opportunities. There are three different patterns to consider:
diff --git a/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll b/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll
index 4555dfb..71a469d 100644
--- a/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -verify-dom-info=1 %s | FileCheck %s
declare void @do_something()
declare void @user(i32)
diff --git a/llvm/test/Transforms/DFAJumpThreading/single_succ_switch.ll b/llvm/test/Transforms/DFAJumpThreading/single_succ_switch.ll
index 00500a7..cc117e7 100644
--- a/llvm/test/Transforms/DFAJumpThreading/single_succ_switch.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/single_succ_switch.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -verify-dom-info=1 %s | FileCheck %s
define void @pr60254() {
; CHECK-LABEL: define void @pr60254() {
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
index d25d0f1..4c0f9db 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
@@ -380,9 +380,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float
; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP38:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE20]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
@@ -396,9 +396,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float
; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP48:%.*]], %[[FREM_ELSE20]] ]
+; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP46:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP38:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]]
; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]])
@@ -408,12 +408,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x half> [[R2]], ptr addrspace(1) [[OUT]], align 8
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 11)
-; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -423,10 +423,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 11
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], half [[TMP27]], half [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
+; CHECK-NEXT: [[TMP38]] = select i1 [[TMP29]], half [[TMP28]], half [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -456,15 +456,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]]
; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]])
; CHECK-NEXT: [[TMP37:%.*]] = fptrunc float [[AX13]] to half
-; CHECK-NEXT: [[TMP38]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP46]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE19]]:
-; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
+; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1
; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP41]], 1
; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP40]], i32 11)
-; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
+; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP42]], 0
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP42]], 1
; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP44]], 1
@@ -474,10 +474,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 [[NB25]], 11
; CHECK-NEXT: br i1 [[TMP45]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]]
; CHECK: [[FREM_ELSE20]]:
-; CHECK-NEXT: [[TMP46:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
-; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
-; CHECK-NEXT: [[TMP48]] = select i1 [[TMP47]], half [[TMP46]], half [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
+; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
+; CHECK-NEXT: [[TMP57]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY27]]:
; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ]
; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ]
@@ -507,8 +507,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]]
; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]])
; CHECK-NEXT: [[TMP56:%.*]] = fptrunc float [[AX46]] to half
-; CHECK-NEXT: [[TMP57]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -532,9 +532,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float
; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP116:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP115:%.*]], %[[FREM_ELSE86]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
@@ -548,9 +548,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float
; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP77:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP68:%.*]], %[[FREM_ELSE20]] ]
+; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP104:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP96:%.*]], %[[FREM_ELSE53]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]]
; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]])
@@ -564,9 +564,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX49:%.*]] = fpext half [[AX47]] to float
; CHECK-NEXT: [[AY50:%.*]] = fpext half [[AY48]] to float
; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX49]], [[AY50]]
-; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
; CHECK: [[BB24:.*]]:
-; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP96:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP87:%.*]], %[[FREM_ELSE53]] ]
+; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP85:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP77:%.*]], %[[FREM_ELSE20]] ]
; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq half [[TMP22]], 0xH0000
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], half 0xH7E00, half [[RET51]]
; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.fabs.f16(half [[TMP21]])
@@ -580,9 +580,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX82:%.*]] = fpext half [[AX80]] to float
; CHECK-NEXT: [[AY83:%.*]] = fpext half [[AY81]] to float
; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX82]], [[AY83]]
-; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB34:.*]]:
-; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP115:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP106:%.*]], %[[FREM_ELSE86]] ]
+; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP66:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP58:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq half [[TMP32]], 0xH0000
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], half 0xH7E00, half [[RET84]]
; CHECK-NEXT: [[TMP37:%.*]] = call half @llvm.fabs.f16(half [[TMP31]])
@@ -592,12 +592,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <4 x half> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
+; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]])
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1
; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 11)
-; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
+; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]])
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0
; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1
@@ -607,10 +607,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 11
; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
-; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
-; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP48:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]])
+; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX82]], [[AY83]]
+; CHECK-NEXT: [[TMP58]] = select i1 [[TMP49]], half [[TMP48]], half [[TMP31]]
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -640,15 +640,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]]
; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]])
; CHECK-NEXT: [[TMP57:%.*]] = fptrunc float [[AX13]] to half
-; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP66]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP31]])
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_COMPUTE19]]:
-; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
+; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]])
; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP59]], 0
; CHECK-NEXT: [[TMP61:%.*]] = extractvalue { float, i32 } [[TMP59]], 1
; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP61]], 1
; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP60]], i32 11)
-; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
+; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]])
; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP62]], 0
; CHECK-NEXT: [[TMP64:%.*]] = extractvalue { float, i32 } [[TMP62]], 1
; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP64]], 1
@@ -658,10 +658,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[NB25]], 11
; CHECK-NEXT: br i1 [[TMP65]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]]
; CHECK: [[FREM_ELSE20]]:
-; CHECK-NEXT: [[TMP66:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
-; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
-; CHECK-NEXT: [[TMP68]] = select i1 [[TMP67]], half [[TMP66]], half [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP67:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]])
+; CHECK-NEXT: [[TMP68:%.*]] = fcmp oeq float [[AX49]], [[AY50]]
+; CHECK-NEXT: [[TMP77]] = select i1 [[TMP68]], half [[TMP67]], half [[TMP21]]
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_LOOP_BODY27]]:
; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ]
; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ]
@@ -691,15 +691,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]]
; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]])
; CHECK-NEXT: [[TMP76:%.*]] = fptrunc float [[AX46]] to half
-; CHECK-NEXT: [[TMP77]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP85]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP21]])
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_COMPUTE52]]:
-; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]])
+; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
; CHECK-NEXT: [[TMP79:%.*]] = extractvalue { float, i32 } [[TMP78]], 0
; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP78]], 1
; CHECK-NEXT: [[EX54:%.*]] = sub i32 [[TMP80]], 1
; CHECK-NEXT: [[AX55:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP79]], i32 11)
-; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]])
+; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
; CHECK-NEXT: [[TMP82:%.*]] = extractvalue { float, i32 } [[TMP81]], 0
; CHECK-NEXT: [[TMP83:%.*]] = extractvalue { float, i32 } [[TMP81]], 1
; CHECK-NEXT: [[EY56:%.*]] = sub i32 [[TMP83]], 1
@@ -709,10 +709,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP84:%.*]] = icmp sgt i32 [[NB58]], 11
; CHECK-NEXT: br i1 [[TMP84]], label %[[FREM_LOOP_BODY60:.*]], label %[[FREM_LOOP_EXIT61]]
; CHECK: [[FREM_ELSE53]]:
-; CHECK-NEXT: [[TMP85:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]])
-; CHECK-NEXT: [[TMP86:%.*]] = fcmp oeq float [[AX49]], [[AY50]]
-; CHECK-NEXT: [[TMP87]] = select i1 [[TMP86]], half [[TMP85]], half [[TMP21]]
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP86:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
+; CHECK-NEXT: [[TMP87:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
+; CHECK-NEXT: [[TMP96]] = select i1 [[TMP87]], half [[TMP86]], half [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY60]]:
; CHECK-NEXT: [[NB_IV62:%.*]] = phi i32 [ [[NB58]], %[[FREM_COMPUTE52]] ], [ [[NB_UPDATE70:%.*]], %[[FREM_LOOP_BODY60]] ]
; CHECK-NEXT: [[AX_LOOP_PHI63:%.*]] = phi float [ [[AX55]], %[[FREM_COMPUTE52]] ], [ [[AX_UPDATE69:%.*]], %[[FREM_LOOP_BODY60]] ]
@@ -742,15 +742,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX78:%.*]] = select i1 [[CLT76]], float [[AXP77]], float [[AX75]]
; CHECK-NEXT: [[AX79:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX78]], i32 [[EY56]])
; CHECK-NEXT: [[TMP95:%.*]] = fptrunc float [[AX79]] to half
-; CHECK-NEXT: [[TMP96]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP21]])
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP104]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE85]]:
-; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]])
+; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0
; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1
; CHECK-NEXT: [[EX87:%.*]] = sub i32 [[TMP99]], 1
; CHECK-NEXT: [[AX88:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP98]], i32 11)
-; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]])
+; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
; CHECK-NEXT: [[TMP101:%.*]] = extractvalue { float, i32 } [[TMP100]], 0
; CHECK-NEXT: [[TMP102:%.*]] = extractvalue { float, i32 } [[TMP100]], 1
; CHECK-NEXT: [[EY89:%.*]] = sub i32 [[TMP102]], 1
@@ -760,10 +760,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP103:%.*]] = icmp sgt i32 [[NB91]], 11
; CHECK-NEXT: br i1 [[TMP103]], label %[[FREM_LOOP_BODY93:.*]], label %[[FREM_LOOP_EXIT94]]
; CHECK: [[FREM_ELSE86]]:
-; CHECK-NEXT: [[TMP104:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]])
-; CHECK-NEXT: [[TMP105:%.*]] = fcmp oeq float [[AX82]], [[AY83]]
-; CHECK-NEXT: [[TMP106]] = select i1 [[TMP105]], half [[TMP104]], half [[TMP31]]
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP105:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
+; CHECK-NEXT: [[TMP106:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
+; CHECK-NEXT: [[TMP115]] = select i1 [[TMP106]], half [[TMP105]], half [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY93]]:
; CHECK-NEXT: [[NB_IV95:%.*]] = phi i32 [ [[NB91]], %[[FREM_COMPUTE85]] ], [ [[NB_UPDATE103:%.*]], %[[FREM_LOOP_BODY93]] ]
; CHECK-NEXT: [[AX_LOOP_PHI96:%.*]] = phi float [ [[AX88]], %[[FREM_COMPUTE85]] ], [ [[AX_UPDATE102:%.*]], %[[FREM_LOOP_BODY93]] ]
@@ -793,8 +793,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX111:%.*]] = select i1 [[CLT109]], float [[AXP110]], float [[AX108]]
; CHECK-NEXT: [[AX112:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX111]], i32 [[EY89]])
; CHECK-NEXT: [[TMP114:%.*]] = fptrunc float [[AX112]] to half
-; CHECK-NEXT: [[TMP115]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP31]])
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP116]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -816,9 +816,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP56:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
@@ -830,9 +830,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
@@ -842,12 +842,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x float> [[R2]], ptr addrspace(1) [[OUT]], align 8
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 12)
-; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -857,10 +857,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 12
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], float [[TMP27]], float [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], float [[TMP28]], float [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -889,15 +889,15 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP37]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP45]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
+; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { float, i32 } [[TMP38]], 0
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP38]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1
; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 12)
-; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
+; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP41]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1
@@ -907,10 +907,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 12
; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP45:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
-; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], float [[TMP45]], float [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP46:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], float [[TMP46]], float [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -939,8 +939,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP55]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP56]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -962,9 +962,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP112:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP111:%.*]], %[[FREM_ELSE78]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
@@ -976,9 +976,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP75:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP67:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP101:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP93:%.*]], %[[FREM_ELSE47]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
@@ -990,9 +990,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX43:%.*]] = call float @llvm.fabs.f32(float [[TMP21]])
; CHECK-NEXT: [[AY44:%.*]] = call float @llvm.fabs.f32(float [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX43]], [[AY44]]
-; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB24:.*]]:
-; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP93:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP85:%.*]], %[[FREM_ELSE47]] ]
+; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP83:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP75:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq float [[TMP22]], 0.000000e+00
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float 0x7FF8000000000000, float [[RET45]]
; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.fabs.f32(float [[TMP21]])
@@ -1004,9 +1004,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX74:%.*]] = call float @llvm.fabs.f32(float [[TMP31]])
; CHECK-NEXT: [[AY75:%.*]] = call float @llvm.fabs.f32(float [[TMP32]])
; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX74]], [[AY75]]
-; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB34:.*]]:
-; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP111:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP103:%.*]], %[[FREM_ELSE78]] ]
+; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP65:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq float [[TMP32]], 0.000000e+00
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float 0x7FF8000000000000, float [[RET76]]
; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.fabs.f32(float [[TMP31]])
@@ -1016,12 +1016,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <4 x float> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
+; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]])
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1
; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 12)
-; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
+; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]])
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0
; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1
@@ -1031,10 +1031,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 12
; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP47:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
-; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], float [[TMP47]], float [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP48:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]])
+; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX74]], [[AY75]]
+; CHECK-NEXT: [[TMP57]] = select i1 [[TMP49]], float [[TMP48]], float [[TMP31]]
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -1063,15 +1063,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP57]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP65]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP31]])
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
+; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]])
; CHECK-NEXT: [[TMP59:%.*]] = extractvalue { float, i32 } [[TMP58]], 0
; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP58]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP60]], 1
; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP59]], i32 12)
-; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
+; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]])
; CHECK-NEXT: [[TMP62:%.*]] = extractvalue { float, i32 } [[TMP61]], 0
; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP61]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP63]], 1
@@ -1081,10 +1081,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[NB21]], 12
; CHECK-NEXT: br i1 [[TMP64]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP65:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
-; CHECK-NEXT: [[TMP66:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP67]] = select i1 [[TMP66]], float [[TMP65]], float [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP66:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]])
+; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX43]], [[AY44]]
+; CHECK-NEXT: [[TMP75]] = select i1 [[TMP67]], float [[TMP66]], float [[TMP21]]
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -1113,15 +1113,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP75]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP83]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP21]])
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_COMPUTE46]]:
-; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]])
+; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
; CHECK-NEXT: [[TMP77:%.*]] = extractvalue { float, i32 } [[TMP76]], 0
; CHECK-NEXT: [[TMP78:%.*]] = extractvalue { float, i32 } [[TMP76]], 1
; CHECK-NEXT: [[EX48:%.*]] = sub i32 [[TMP78]], 1
; CHECK-NEXT: [[AX49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP77]], i32 12)
-; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]])
+; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP79]], 0
; CHECK-NEXT: [[TMP81:%.*]] = extractvalue { float, i32 } [[TMP79]], 1
; CHECK-NEXT: [[EY50:%.*]] = sub i32 [[TMP81]], 1
@@ -1131,10 +1131,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[NB52]], 12
; CHECK-NEXT: br i1 [[TMP82]], label %[[FREM_LOOP_BODY54:.*]], label %[[FREM_LOOP_EXIT55]]
; CHECK: [[FREM_ELSE47]]:
-; CHECK-NEXT: [[TMP83:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]])
-; CHECK-NEXT: [[TMP84:%.*]] = fcmp oeq float [[AX43]], [[AY44]]
-; CHECK-NEXT: [[TMP85]] = select i1 [[TMP84]], float [[TMP83]], float [[TMP21]]
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP84:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
+; CHECK-NEXT: [[TMP85:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP93]] = select i1 [[TMP85]], float [[TMP84]], float [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY54]]:
; CHECK-NEXT: [[NB_IV56:%.*]] = phi i32 [ [[NB52]], %[[FREM_COMPUTE46]] ], [ [[NB_UPDATE64:%.*]], %[[FREM_LOOP_BODY54]] ]
; CHECK-NEXT: [[AX_LOOP_PHI57:%.*]] = phi float [ [[AX49]], %[[FREM_COMPUTE46]] ], [ [[AX_UPDATE63:%.*]], %[[FREM_LOOP_BODY54]] ]
@@ -1163,15 +1163,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP71:%.*]] = fadd float [[AX69]], [[AY51]]
; CHECK-NEXT: [[AX72:%.*]] = select i1 [[CLT70]], float [[AXP71]], float [[AX69]]
; CHECK-NEXT: [[AX73:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX72]], i32 [[EY50]])
-; CHECK-NEXT: [[TMP93]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP21]])
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP101]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE77]]:
-; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]])
+; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
; CHECK-NEXT: [[TMP95:%.*]] = extractvalue { float, i32 } [[TMP94]], 0
; CHECK-NEXT: [[TMP96:%.*]] = extractvalue { float, i32 } [[TMP94]], 1
; CHECK-NEXT: [[EX79:%.*]] = sub i32 [[TMP96]], 1
; CHECK-NEXT: [[AX80:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP95]], i32 12)
-; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]])
+; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0
; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1
; CHECK-NEXT: [[EY81:%.*]] = sub i32 [[TMP99]], 1
@@ -1181,10 +1181,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt i32 [[NB83]], 12
; CHECK-NEXT: br i1 [[TMP100]], label %[[FREM_LOOP_BODY85:.*]], label %[[FREM_LOOP_EXIT86]]
; CHECK: [[FREM_ELSE78]]:
-; CHECK-NEXT: [[TMP101:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]])
-; CHECK-NEXT: [[TMP102:%.*]] = fcmp oeq float [[AX74]], [[AY75]]
-; CHECK-NEXT: [[TMP103]] = select i1 [[TMP102]], float [[TMP101]], float [[TMP31]]
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP102:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT: [[TMP103:%.*]] = fcmp oeq float [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP111]] = select i1 [[TMP103]], float [[TMP102]], float [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY85]]:
; CHECK-NEXT: [[NB_IV87:%.*]] = phi i32 [ [[NB83]], %[[FREM_COMPUTE77]] ], [ [[NB_UPDATE95:%.*]], %[[FREM_LOOP_BODY85]] ]
; CHECK-NEXT: [[AX_LOOP_PHI88:%.*]] = phi float [ [[AX80]], %[[FREM_COMPUTE77]] ], [ [[AX_UPDATE94:%.*]], %[[FREM_LOOP_BODY85]] ]
@@ -1213,8 +1213,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP102:%.*]] = fadd float [[AX100]], [[AY82]]
; CHECK-NEXT: [[AX103:%.*]] = select i1 [[CLT101]], float [[AXP102]], float [[AX100]]
; CHECK-NEXT: [[AX104:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX103]], i32 [[EY81]])
-; CHECK-NEXT: [[TMP111]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP31]])
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP112]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -1236,9 +1236,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call double @llvm.fabs.f64(double [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call double @llvm.fabs.f64(double [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt double [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP56:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq double [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double 0x7FF8000000000000, double [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.fabs.f64(double [[TMP1]])
@@ -1250,9 +1250,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call double @llvm.fabs.f64(double [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call double @llvm.fabs.f64(double [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt double [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq double [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], double 0x7FF8000000000000, double [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fabs.f64(double [[TMP11]])
@@ -1262,12 +1262,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x double> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { double, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP21]], i32 26)
-; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { double, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { double, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -1277,10 +1277,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 26
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq double [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], double [[TMP27]], double [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq double [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], double [[TMP28]], double [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -1309,15 +1309,15 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd double [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], double [[AXP9]], double [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP37]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP45]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]])
+; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]])
; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { double, i32 } [[TMP38]], 0
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { double, i32 } [[TMP38]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1
; CHECK-NEXT: [[AX18:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP39]], i32 26)
-; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]])
+; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]])
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { double, i32 } [[TMP41]], 0
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { double, i32 } [[TMP41]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1
@@ -1327,10 +1327,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 26
; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP45:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]])
-; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq double [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], double [[TMP45]], double [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP46:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]])
+; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq double [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], double [[TMP46]], double [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi double [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -1359,8 +1359,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd double [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], double [[AXP40]], double [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP55]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP56]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/Transforms/GVN/PRE/pre-load.ll b/llvm/test/Transforms/GVN/PRE/pre-load.ll
index 5a07f9f..afa1354 100644
--- a/llvm/test/Transforms/GVN/PRE/pre-load.ll
+++ b/llvm/test/Transforms/GVN/PRE/pre-load.ll
@@ -1503,3 +1503,51 @@ wrong:
exit:
ret void
}
+
+; Allow the load to be made available on the edge (%entry, %if.end) as part of PRE,
+; but ensure `%identical.l` is not hoisted to its predecessor due to the local
+; dependency with the call.
+
+define i32 @test24(ptr noalias %p, ptr noalias %q, i1 %c) {
+; MDEP-LABEL: @test24(
+; MDEP-NEXT: entry:
+; MDEP-NEXT: br i1 [[C:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]]
+; MDEP: entry.if.end_crit_edge:
+; MDEP-NEXT: [[VV_PRE:%.*]] = load i32, ptr [[X:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END:%.*]]
+; MDEP: if.then:
+; MDEP-NEXT: call void @opaque(ptr [[X]])
+; MDEP-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MDEP-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MDEP-NEXT: br label [[IF_END]]
+; MDEP: if.end:
+; MDEP-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ]
+; MDEP-NEXT: ret i32 [[VV]]
+;
+; MSSA-LABEL: @test24(
+; MSSA-NEXT: entry:
+; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; MSSA: if.then:
+; MSSA-NEXT: call void @opaque(ptr [[X:%.*]])
+; MSSA-NEXT: [[UU:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: store i32 [[UU]], ptr [[R:%.*]], align 4
+; MSSA-NEXT: br label [[IF_END]]
+; MSSA: if.end:
+; MSSA-NEXT: [[VV:%.*]] = load i32, ptr [[X]], align 4
+; MSSA-NEXT: ret i32 [[VV]]
+;
+entry:
+ br i1 %c, label %if.end, label %if.then
+
+if.then:
+ call void @opaque(ptr %p)
+ %identical.l = load i32, ptr %p, align 4
+ store i32 %identical.l, ptr %q, align 4
+ br label %if.end
+
+if.end:
+ %l = load i32, ptr %p, align 4
+ ret i32 %l
+}
+
+declare void @opaque(ptr) nounwind willreturn
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
index cb4e07e..9b9bc68 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll
@@ -60,8 +60,7 @@ define void @f_sadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -150,8 +149,7 @@ define void @f_uadd_overflow(ptr %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -243,10 +241,7 @@ define void @f_ssub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
@@ -339,10 +334,7 @@ define void @f_usub_overflow(ptr nocapture %a) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
-; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
+; CHECK-NEXT: br i1 true, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]]
; CHECK: [[TRAP]]:
; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]]
; CHECK-NEXT: unreachable, !nosanitize [[META0]]
diff --git a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
index 89b132e..6732efc 100644
--- a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
+++ b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -p indvars -S %s | FileCheck %s
+; RUN: opt -p indvars -data-layout='n32:64' -S %s | FileCheck --check-prefix=N32 %s
declare i1 @cond()
@@ -18,7 +19,7 @@ define i64 @test_ptr_compare_guard(ptr %start, ptr %end) {
; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[EXIT_LOOPEXIT:.*]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
-; CHECK-NEXT: [[I64_IV_NEXT]] = add i64 [[I64_IV]], 1
+; CHECK-NEXT: [[I64_IV_NEXT]] = add nuw i64 [[I64_IV]], 1
; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]]
; CHECK: [[EXIT_LOOPEXIT]]:
@@ -28,6 +29,32 @@ define i64 @test_ptr_compare_guard(ptr %start, ptr %end) {
; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
+; N32-LABEL: define i64 @test_ptr_compare_guard(
+; N32-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; N32-NEXT: [[ENTRY:.*]]:
+; N32-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[C_0:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; N32: [[LOOP_HEADER_PREHEADER]]:
+; N32-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -1
+; N32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; N32-NEXT: br label %[[LOOP_HEADER:.*]]
+; N32: [[LOOP_HEADER]]:
+; N32-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[LOOP_HEADER_PREHEADER]] ]
+; N32-NEXT: [[C_1:%.*]] = call i1 @cond()
+; N32-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[EXIT_LOOPEXIT:.*]]
+; N32: [[LOOP_LATCH]]:
+; N32-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
+; N32-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; N32-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: [[RES_PH:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[TMP1]], %[[LOOP_LATCH]] ]
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ]
+; N32-NEXT: ret i64 [[RES]]
+;
entry:
%c.0 = icmp eq ptr %start, %end
br i1 %c.0, label %exit, label %loop.header
@@ -48,3 +75,142 @@ exit:
%res = phi i64 [ 0, %entry ], [ %i64.iv, %loop.latch ], [ 0, %loop.header ]
ret i64 %res
}
+
+define void @test_sub_cmp(ptr align 8 %start, ptr %end) {
+; CHECK-LABEL: define void @test_sub_cmp(
+; CHECK-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]]
+; CHECK-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; CHECK: [[LOOP_HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT: [[CMP_LATCH:%.*]] = icmp ult i64 [[IV_NEXT]], [[PTR_DIFF]]
+; CHECK-NEXT: br i1 [[CMP_LATCH]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]]
+; CHECK: [[EXIT_EARLY]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; N32-LABEL: define void @test_sub_cmp(
+; N32-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) {
+; N32-NEXT: [[ENTRY:.*:]]
+; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; N32-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]]
+; N32: [[LOOP_HEADER_PREHEADER]]:
+; N32-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[PTR_DIFF]], i64 1)
+; N32-NEXT: br label %[[LOOP_HEADER:.*]]
+; N32: [[LOOP_HEADER]]:
+; N32-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ]
+; N32-NEXT: [[C_1:%.*]] = call i1 @cond()
+; N32-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]]
+; N32: [[LOOP_LATCH]]:
+; N32-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]]
+; N32-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]]
+; N32: [[EXIT_EARLY]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: ret void
+;
+entry:
+ %start.int = ptrtoint ptr %start to i64
+ %end.int = ptrtoint ptr %end to i64
+ %ptr.diff = sub i64 %start.int, %end.int
+ %cmp.entry = icmp eq ptr %start, %end
+ br i1 %cmp.entry, label %exit, label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %c.1 = call i1 @cond()
+ br i1 %c.1, label %exit.early, label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %cmp.latch = icmp ult i64 %iv.next, %ptr.diff
+ br i1 %cmp.latch, label %loop.header, label %exit
+
+exit.early:
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+define void @test_ptr_diff_with_assume(ptr align 8 %start, ptr align 8 %end, ptr %P) {
+; CHECK-LABEL: define void @test_ptr_diff_with_assume(
+; CHECK-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; CHECK-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; CHECK-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; CHECK-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; CHECK: [[LOOP_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_BODY:.*]]
+; CHECK: [[LOOP_BODY]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; N32-LABEL: define void @test_ptr_diff_with_assume(
+; N32-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) {
+; N32-NEXT: [[ENTRY:.*:]]
+; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; N32-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; N32-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; N32-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; N32: [[LOOP_BODY_PREHEADER]]:
+; N32-NEXT: br label %[[LOOP_BODY:.*]]
+; N32: [[LOOP_BODY]]:
+; N32-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; N32-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: ret void
+;
+entry:
+ %start.int = ptrtoint ptr %start to i64
+ %end.int = ptrtoint ptr %end to i64
+ %ptr.diff = sub i64 %start.int, %end.int
+ %diff.cmp = icmp ult i64 %ptr.diff, 2
+ call void @llvm.assume(i1 %diff.cmp)
+ %computed.end = getelementptr i8, ptr %start, i64 %ptr.diff
+ %entry.cmp = icmp eq ptr %start, %end
+ br i1 %entry.cmp, label %exit, label %loop.body
+
+loop.body:
+ %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop.body ]
+ call i1 @cond()
+ %iv.next = getelementptr i8, ptr %iv, i64 1
+ %loop.cmp = icmp eq ptr %iv.next, %computed.end
+ br i1 %loop.cmp, label %exit, label %loop.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
new file mode 100644
index 0000000..b9c9228
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/unreachable-exit.ll
@@ -0,0 +1,738 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=indvars < %s | FileCheck %s
+
+define void @optimize_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_atomic(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_atomic(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store atomic i8 [[TMP4]], ptr [[ARRAYIDX7]] unordered, align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store atomic i8 %1, ptr %arrayidx7 unordered, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_volatile(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_volatile(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: call void @x(ptr null)
+; CHECK-NEXT: store volatile i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.trap()
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ call void @x(ptr null)
+ store volatile i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @optimize_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @optimize_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[BLOCK_SIZE]], -1
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 3, [[UMIN]]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @llvm.ubsantrap(i8 1)
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @llvm.ubsantrap(i8 1)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_arbitrary_call(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_arbitrary_call(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn_with_argmem(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[TMP3]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_argmem(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[FOR_BODY_CONT:.*]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body:
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %p = call i1 @pred()
+ br i1 %p, label %for.body.cont, label %for.cond.cleanup.loopexit
+
+for.body.cont: ; preds = %for.body.preheader, %if.end4
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_two_exits2(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_two_exits2(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_BODY_CONT:.*]]
+; CHECK: [[FOR_BODY_CONT]]:
+; CHECK-NEXT: [[P:%.*]] = call i1 @pred()
+; CHECK-NEXT: br i1 [[P]], label %[[IF_END4]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @noreturn(ptr [[FOO_ARR]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body:
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %for.body.cont
+
+for.body.cont: ; preds = %for.body.preheader, %if.end4
+ %p = call i1 @pred()
+ br i1 %p, label %if.end4, label %for.cond.cleanup.loopexit
+
+if.then: ; preds = %for.body
+ call void @noreturn(ptr %foo_arr)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_depdendent_ubsan_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_depdendent_ubsan_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = phi i32 [ [[I_015]], %[[FOR_BODY]] ]
+; CHECK-NEXT: call void @noreturn_with_i32(i32 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ call void @noreturn_with_i32(i32 %i.015)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define void @no_optimize_depdendent_load_trap(i32 %block_size) {
+; CHECK-LABEL: define void @no_optimize_depdendent_load_trap(
+; CHECK-SAME: i32 [[BLOCK_SIZE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[FOO_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: [[BAR_ARR:%.*]] = alloca [2 x i8], align 16
+; CHECK-NEXT: call void @x(ptr nonnull [[FOO_ARR]])
+; CHECK-NEXT: [[CMP14_NOT:%.*]] = icmp eq i32 [[BLOCK_SIZE]], 0
+; CHECK-NEXT: br i1 [[CMP14_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: call void @x(ptr nonnull [[BAR_ARR]])
+; CHECK-NEXT: ret void
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], %[[IF_END4:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[I_015]], 2
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[IF_END4]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[I_015_LCSSA:%.*]] = load i8, ptr [[FOO_ARR]], align 1
+; CHECK-NEXT: call void @noreturn_with_i8(i8 [[I_015_LCSSA]])
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END4]]:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1024 x i8], ptr [[FOO_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[TMP0]], 54
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw [1025 x i8], ptr [[BAR_ARR]], i64 0, i32 [[I_015]]
+; CHECK-NEXT: store i8 [[TMP1]], ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INC]], [[BLOCK_SIZE]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+ %foo_arr = alloca [2 x i8], align 16
+ %bar_arr = alloca [2 x i8], align 16
+ call void @x(ptr nonnull %foo_arr)
+ %cmp14.not = icmp eq i32 %block_size, 0
+ br i1 %cmp14.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %if.end4
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ call void @x(ptr nonnull %bar_arr)
+ ret void
+
+for.body: ; preds = %for.body.preheader, %if.end4
+ %i.015 = phi i32 [ %inc, %if.end4 ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp samesign ugt i32 %i.015, 2
+ br i1 %cmp1, label %if.then, label %if.end4
+
+if.then: ; preds = %for.body
+ %r = load i8, ptr %foo_arr, align 1
+ call void @noreturn_with_i8(i8 %r)
+ unreachable
+
+if.end4: ; preds = %for.body
+ %arrayidx = getelementptr inbounds nuw [1024 x i8], ptr %foo_arr, i64 0, i32 %i.015
+ %0 = load i8, ptr %arrayidx, align 1
+ %1 = xor i8 %0, 54
+ %arrayidx7 = getelementptr inbounds nuw [1025 x i8], ptr %bar_arr, i64 0, i32 %i.015
+ store i8 %1, ptr %arrayidx7, align 1
+ %inc = add nuw nsw i32 %i.015, 1
+ %cmp = icmp ult i32 %inc, %block_size
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+
+declare void @x(ptr noundef) local_unnamed_addr
+declare i1 @pred() local_unnamed_addr
+
+declare void @llvm.trap() #0
+declare void @noreturn(ptr) #0
+declare void @noreturn_with_i32(i32) #0
+declare void @noreturn_with_i8(i8) #0
+declare void @noreturn_with_argmem(ptr) #1
+
+attributes #0 = { cold noreturn nounwind memory(inaccessiblemem: write) }
+attributes #1 = { cold noreturn nounwind memory(argmem: read) }
diff --git a/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll
new file mode 100644
index 0000000..0887f5e
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/state-accounting-skip-non-cold.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; REQUIRES: llvm_inliner_model_autogenerated && asserts
+; RUN: opt -passes='default<O3>' -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s
+; RUN: opt -passes='default<O3>' -ml-inliner-stop-immediately -enable-ml-inliner=release -ml-inliner-skip-policy=if-caller-not-cold -S %s -o - | FileCheck %s
+
+declare ptr @f()
+
+define void @e() #0 {
+; CHECK-LABEL: define void @e(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: ret void
+;
+ call void @h()
+ call void @h()
+ call void @h()
+ ret void
+}
+
+define void @d() {
+; CHECK-LABEL: define void @d() local_unnamed_addr {
+; CHECK-NEXT: tail call void @f()
+; CHECK-NEXT: ret void
+;
+ call void @f()
+ ret void
+}
+
+define void @g() {
+; CHECK-LABEL: define void @g() local_unnamed_addr {
+; CHECK-NEXT: tail call void @f()
+; CHECK-NEXT: ret void
+;
+ call void @f()
+ ret void
+}
+
+define void @h() #0 {
+; CHECK-LABEL: define void @h(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: tail call void @d()
+; CHECK-NEXT: tail call void @g()
+; CHECK-NEXT: ret void
+;
+ call void @d()
+ call void @g()
+ ret void
+}
+
+attributes #0 = { "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" }
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index fae1365..e1d39fd 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) {
ret float %p
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @test_4(
; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823)
; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double>
+; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]]
; CHECK-NEXT: ret <4 x double> [[RES]]
;
; Drop two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 702bbbb..57184ea 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
ret float %mul3.i.i
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) {
+; CHECK-LABEL: @test_ui_ui_i8_mul_vec(
+; CHECK-NEXT: [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half>
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half>
+; CHECK-NEXT: [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]]
+; CHECK-NEXT: ret <2 x half> [[R]]
+;
+ %x = and <2 x i8> %x_in, splat (i8 15)
+ %y = and <2 x i8> %y_in, splat (i8 15)
+ %xf = uitofp <2 x i8> %x to <2 x half>
+ %yf = uitofp <2 x i8> %y to <2 x half>
+ %r = fmul <2 x half> %xf, %yf
+ ret <2 x half> %r
+}
+
define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) {
; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison(
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
@@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c,
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT: [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00>
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
-; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
+; CHECK-NEXT: ret <2 x float> [[MUL3_I_I1]]
;
%sel = select i1 %c, i32 65529, i32 53264
%conv.i.s = trunc i32 %sel to i16
diff --git a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
index 7cc4446..ad45d1e 100644
--- a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
+++ b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
@@ -11,10 +11,8 @@ define i16 @test5(i16 %A) !dbg !34 {
call void @llvm.dbg.value(metadata i32 %C, metadata !37, metadata !DIExpression()), !dbg !41
; Preserve the dbg.value for the DCE'd 32-bit 'and'.
- ;
- ; The high 16 bits of the original 'and' require sign-extending the new 16-bit and:
; CHECK-NEXT: #dbg_value(i16 [[and]], [[C:![0-9]+]],
- ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value)
+ ; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_stack_value)
%D = trunc i32 %C to i16, !dbg !42
call void @llvm.dbg.value(metadata i16 %D, metadata !38, metadata !DIExpression()), !dbg !42
diff --git a/llvm/test/Transforms/InstCombine/icmp-trunc.ll b/llvm/test/Transforms/InstCombine/icmp-trunc.ll
index b85deab..ad76ef7 100644
--- a/llvm/test/Transforms/InstCombine/icmp-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-trunc.ll
@@ -3,6 +3,7 @@
; RUN: opt < %s -passes=instcombine -S -data-layout="n8" | FileCheck %s --check-prefixes=CHECK,DL8
declare void @use(i8)
+declare void @use2(i4)
define i1 @ult_2(i32 %x) {
; CHECK-LABEL: @ult_2(
@@ -785,3 +786,32 @@ define <2 x i1> @uge_nsw_non_splat(<2 x i32> %x) {
ret <2 x i1> %r
}
+define i1 @trunc_icmp(i8 %a0) {
+; CHECK-LABEL: @trunc_icmp(
+; CHECK-NEXT: [[TZ:%.*]] = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[A0:%.*]], i1 false)
+; CHECK-NEXT: [[TR:%.*]] = trunc nuw i8 [[TZ]] to i4
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A0]], 0
+; CHECK-NEXT: call void @use2(i4 [[TR]])
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %tz = tail call range(i8 0, 9) i8 @llvm.cttz.i8(i8 %a0, i1 false)
+ %tr = trunc i8 %tz to i4
+ %c = icmp eq i4 %tr, 8
+ call void @use2(i4 %tr)
+ ret i1 %c
+}
+
+define i1 @do_not_mask_trunc_eq_i32_i8(i32 %x) {
+; DL64-LABEL: @do_not_mask_trunc_eq_i32_i8(
+; DL64-NEXT: [[R:%.*]] = icmp eq i32 [[X:%.*]], 42
+; DL64-NEXT: ret i1 [[R]]
+;
+; DL8-LABEL: @do_not_mask_trunc_eq_i32_i8(
+; DL8-NEXT: [[T:%.*]] = trunc nuw i32 [[X:%.*]] to i8
+; DL8-NEXT: [[R:%.*]] = icmp eq i8 [[T]], 42
+; DL8-NEXT: ret i1 [[R]]
+;
+ %t = trunc nuw i32 %x to i8
+ %r = icmp eq i8 %t, 42
+ ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
index 69b8f69..82ecbd4 100644
--- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
@@ -86,3 +86,14 @@ define <4 x ptr> @test7(<4 x i128> %arg) nounwind {
%p1 = inttoptr <4 x i128> %arg to <4 x ptr>
ret <4 x ptr> %p1
}
+
+define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) {
+; CHECK-LABEL: @ptrtoint_gep_sub(
+; CHECK-NEXT: ret i64 [[END_ADDR:%.*]]
+;
+ %ptr.addr = ptrtoint ptr %ptr to i64
+ %size = sub i64 %end.addr, %ptr.addr
+ %end = getelementptr i8, ptr %ptr, i64 %size
+ %end.addr2 = ptrtoint ptr %end to i64
+ ret i64 %end.addr2
+}
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 7b0b152..ffaa8b1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -23,10 +23,7 @@ define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
-; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
-; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
-; CHECK-NEXT: ret i32 [[TOADDR]]
+; CHECK-NEXT: ret i32 [[A]]
;
%toptr = inttoptr i32 %a to ptr addrspace(1)
%toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
diff --git a/llvm/test/Transforms/InstCombine/select-safe-impliedcond-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-impliedcond-transforms.ll
index ba34930..bc988a9 100644
--- a/llvm/test/Transforms/InstCombine/select-safe-impliedcond-transforms.ll
+++ b/llvm/test/Transforms/InstCombine/select-safe-impliedcond-transforms.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
define i1 @a_true_implies_b_true(i8 %z, i1 %X, i1 %Y) {
@@ -34,15 +34,15 @@ define <2 x i1> @a_true_implies_b_true_vec(i8 %z0, <2 x i1> %X, <2 x i1> %Y) {
ret <2 x i1> %res
}
-define i1 @a_true_implies_b_true2(i8 %z, i1 %X, i1 %Y) {
+define i1 @a_true_implies_b_true2(i8 %z, i1 %X, i1 %Y) !prof !0 {
; CHECK-LABEL: @a_true_implies_b_true2(
; CHECK-NEXT: [[A:%.*]] = icmp ugt i8 [[Z:%.*]], 20
-; CHECK-NEXT: [[RES:%.*]] = select i1 [[A]], i1 [[X:%.*]], i1 false
+; CHECK-NEXT: [[RES:%.*]] = select i1 [[A]], i1 [[X:%.*]], i1 false, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: ret i1 [[RES]]
;
%a = icmp ugt i8 %z, 20
%b = icmp ugt i8 %z, 10
- %sel = select i1 %b, i1 %X, i1 %Y
+ %sel = select i1 %b, i1 %X, i1 %Y, !prof !1
%res = and i1 %a, %sel
ret i1 %res
}
@@ -258,3 +258,10 @@ define i1 @neg_icmp_eq_implies_trunc(i8 %x, i1 %c) {
%sel2 = select i1 %cmp, i1 true, i1 %sel1
ret i1 %sel2
}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 2, i32 3}
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
+;.
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
index 7b30edb..71dad41 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/WebAssembly/any_all_true.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=instsimplify -S < %s | FileCheck %s
+; RUN: opt -passes=instsimplify -use-constant-int-for-fixed-length-splat -S < %s | FileCheck %s
; Test that intrinsics wasm call are constant folded
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
index 68b45a94..f68b85e 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitcount.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
declare i31 @llvm.ctpop.i31(i31 %val)
declare i32 @llvm.cttz.i32(i32 %val, i1)
@@ -120,6 +121,22 @@ define <2 x i31> @ctpop_vector() {
ret <2 x i31> %x
}
+define <2 x i31> @ctpop_vector_splat_v2i31() {
+; CHECK-LABEL: @ctpop_vector_splat_v2i31(
+; CHECK-NEXT: ret <2 x i31> splat (i31 1)
+;
+ %x = call <2 x i31> @llvm.ctpop.v2i31(<2 x i31> splat(i31 16))
+ ret <2 x i31> %x
+}
+
+define <vscale x 2 x i31> @ctpop_vector_splat_nxv2i31() {
+; CHECK-LABEL: @ctpop_vector_splat_nxv2i31(
+; CHECK-NEXT: ret <vscale x 2 x i31> splat (i31 1)
+;
+ %x = call <vscale x 2 x i31> @llvm.ctpop.nxv2i31(<vscale x 2 x i31> splat(i31 16))
+ ret <vscale x 2 x i31> %x
+}
+
define <2 x i31> @ctpop_vector_undef() {
; CHECK-LABEL: @ctpop_vector_undef(
; CHECK-NEXT: ret <2 x i31> zeroinitializer
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll
new file mode 100644
index 0000000..409141a
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bitreverse.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
+
+define i16 @W() {
+; CHECK-LABEL: define i16 @W() {
+; CHECK-NEXT: ret i16 -32768
+;
+ %Z = call i16 @llvm.bitreverse.i16(i16 1)
+ ret i16 %Z
+}
+
+define i32 @X() {
+; CHECK-LABEL: define i32 @X() {
+; CHECK-NEXT: ret i32 -2147483648
+;
+ %Z = call i32 @llvm.bitreverse.i32(i32 1)
+ ret i32 %Z
+}
+
+define i64 @Y() {
+; CHECK-LABEL: define i64 @Y() {
+; CHECK-NEXT: ret i64 -9223372036854775808
+;
+ %Z = call i64 @llvm.bitreverse.i64(i64 1)
+ ret i64 %Z
+}
+
+define i80 @Z() {
+; CHECK-LABEL: define i80 @Z() {
+; CHECK-NEXT: ret i80 23777929115895377691656
+;
+ %Z = call i80 @llvm.bitreverse.i80(i80 76151636403560493650080)
+ ret i80 %Z
+}
+
+define <4 x i32> @bitreverse_splat_v4i32() {
+; CHECK-LABEL: define <4 x i32> @bitreverse_splat_v4i32() {
+; CHECK-NEXT: ret <4 x i32> splat (i32 -2147483648)
+;
+ %Z = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> splat(i32 1))
+ ret <4 x i32> %Z
+}
+
+define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() {
+; CHECK-LABEL: define <vscale x 4 x i32> @bitreverse_splat_nxv4i32() {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 -2147483648)
+;
+ %Z = call <vscale x 4 x i32> @llvm.bitreverse.v4i32(<vscale x 4 x i32> splat(i32 1))
+ ret <vscale x 4 x i32> %Z
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
index 42bb733..4db8ced 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/bswap.ll
@@ -2,6 +2,7 @@
; bswap should be constant folded when it is passed a constant argument
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat -S | FileCheck %s
declare i16 @llvm.bswap.i16(i16)
@@ -42,3 +43,19 @@ define i80 @Z() {
%Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 )
ret i80 %Z
}
+
+define <4 x i32> @bswap_splat_v4i32() {
+; CHECK-LABEL: define <4 x i32> @bswap_splat_v4i32() {
+; CHECK-NEXT: ret <4 x i32> splat (i32 16777216)
+;
+ %Z = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> splat(i32 1))
+ ret <4 x i32> %Z
+}
+
+define <vscale x 4 x i32> @bswap_splat_nxv4i32() {
+; CHECK-LABEL: define <vscale x 4 x i32> @bswap_splat_nxv4i32() {
+; CHECK-NEXT: ret <vscale x 4 x i32> splat (i32 16777216)
+;
+ %Z = call <vscale x 4 x i32> @llvm.bswap.v4i32(<vscale x 4 x i32> splat(i32 1))
+ ret <vscale x 4 x i32> %Z
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
index e994921..9f9e3f9 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s
declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
diff --git a/llvm/test/Transforms/InstSimplify/ptrmask.ll b/llvm/test/Transforms/InstSimplify/ptrmask.ll
index 5e7c636..a3483af 100644
--- a/llvm/test/Transforms/InstSimplify/ptrmask.ll
+++ b/llvm/test/Transforms/InstSimplify/ptrmask.ll
@@ -158,6 +158,26 @@ define ptr addrspace(1) @ptrmask_simplify_ptrmask_i32(ptr addrspace(1) %p) {
ret ptr addrspace(1) %r
}
+define ptr @ptrmask_simplify_ptrtoaddr(ptr %p) {
+; CHECK-LABEL: define ptr @ptrmask_simplify_ptrtoaddr
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT: ret ptr [[P]]
+;
+ %m = ptrtoaddr ptr %p to i64
+ %r = call ptr @llvm.ptrmask.p0.i64(ptr %p, i64 %m)
+ ret ptr %r
+}
+
+define ptr addrspace(1) @ptrmask_simplify_ptrtoaddr_i32(ptr addrspace(1) %p) {
+; CHECK-LABEL: define ptr addrspace(1) @ptrmask_simplify_ptrtoaddr_i32
+; CHECK-SAME: (ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT: ret ptr addrspace(1) [[P]]
+;
+ %m = ptrtoaddr ptr addrspace(1) %p to i32
+ %r = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) %p, i32 %m)
+ ret ptr addrspace(1) %r
+}
+
define ptr @ptrmask_simplify_aligned_unused(ptr align 64 %p) {
; CHECK-LABEL: define ptr @ptrmask_simplify_aligned_unused
; CHECK-SAME: (ptr align 64 [[P:%.*]]) {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index ed9fba3..22ab79d 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -289,6 +289,225 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
ret void
}
+define void @deinterleave1_nxi64_factor3(ptr %ptr, <vscale x 4 x i64>* %s1, <vscale x 4 x i64>* %s2, <vscale x 4 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } poison, <vscale x 4 x i64> [[TMP10]], 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP15]], <vscale x 4 x i64> [[TMP12]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP16]], <vscale x 4 x i64> [[TMP14]], 2
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } [[TMP17]], 2
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP18]], ptr [[S1]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP19]], ptr [[S2]], align 32
+; CHECK-NEXT: store <vscale x 4 x i64> [[TMP20]], ptr [[S3]], align 32
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 12 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } %ldN, 2
+
+ store <vscale x 4 x i64> %3, <vscale x 4 x i64>* %s1
+ store <vscale x 4 x i64> %4, <vscale x 4 x i64>* %s2
+ store <vscale x 4 x i64> %5, <vscale x 4 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave2_nxi64_factor3(ptr %ptr, <vscale x 8 x i64>* %s1, <vscale x 8 x i64>* %s2, <vscale x 8 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP9]], i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP11]], i64 2)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN2]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP13]], i64 2)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP16]], i64 4)
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP18]], i64 4)
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN3]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP20]], i64 4)
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[LDN4:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP17]], <vscale x 2 x i64> [[TMP23]], i64 6)
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP25]], i64 6)
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN4]], 2
+; CHECK-NEXT: [[TMP28:%.*]] = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP21]], <vscale x 2 x i64> [[TMP27]], i64 6)
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } poison, <vscale x 8 x i64> [[TMP24]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP29]], <vscale x 8 x i64> [[TMP26]], 1
+; CHECK-NEXT: [[TMP31:%.*]] = insertvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP30]], <vscale x 8 x i64> [[TMP28]], 2
+; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 0
+; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } [[TMP31]], 2
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP32]], ptr [[S1]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP33]], ptr [[S2]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP34]], ptr [[S3]], align 64
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } %ldN, 2
+
+ store <vscale x 8 x i64> %3, <vscale x 8 x i64>* %s1
+ store <vscale x 8 x i64> %4, <vscale x 8 x i64>* %s2
+ store <vscale x 8 x i64> %5, <vscale x 8 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg1_nxi64_factor3(ptr %ptr, <vscale x 1 x i64>* %s1, <vscale x 1 x i64>* %s2, <vscale x 1 x i64>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 3 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 3 x i64>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64> %wide.vec)
+
+ %3 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 0
+ %4 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 1
+ %5 = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } %ldN, 2
+
+ store <vscale x 1 x i64> %3, <vscale x 1 x i64>* %s1
+ store <vscale x 1 x i64> %4, <vscale x 1 x i64>* %s2
+ store <vscale x 1 x i64> %5, <vscale x 1 x i64>* %s3
+ ret void
+}
+
+define void @deinterleave_neg2_nxi8_factor3(ptr %ptr, <vscale x 8 x i8>* %s1, <vscale x 8 x i8>* %s2, <vscale x 8 x i8>* %s3) #0 {
+; CHECK-LABEL: define void @deinterleave_neg2_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], ptr [[S1:%.*]], ptr [[S2:%.*]], ptr [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 24 x i8>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[LDN:%.*]] = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } [[LDN]], 2
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP1]], ptr [[S1]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP2]], ptr [[S2]], align 8
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP3]], ptr [[S3]], align 8
+; CHECK-NEXT: ret void
+;
+ %wide.vec = load <vscale x 24 x i8>, ptr %ptr, align 8
+ %ldN = tail call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv12i8(<vscale x 24 x i8> %wide.vec)
+
+ %3 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 0
+ %4 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 1
+ %5 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %ldN, 2
+
+ store <vscale x 8 x i8> %3, <vscale x 8 x i8>* %s1
+ store <vscale x 8 x i8> %4, <vscale x 8 x i8>* %s2
+ store <vscale x 8 x i8> %5, <vscale x 8 x i8>* %s3
+ ret void
+}
+
+define void @interleave1_nxi64_factor3(ptr %ptr, <vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave1_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i64> [[S1:%.*]], <vscale x 8 x i64> [[S2:%.*]], <vscale x 8 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 4)
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 9
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S1]], i64 6)
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S2]], i64 6)
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv8i64(<vscale x 8 x i64> [[S3]], i64 6)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP13]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64> %s1, <vscale x 8 x i64> %s2, <vscale x 8 x i64> %s3)
+
+ store <vscale x 24 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave2_nxi64_factor3(ptr %ptr, <vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3) #0 {
+; CHECK-LABEL: define void @interleave2_nxi64_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i64> [[S1:%.*]], <vscale x 4 x i64> [[S2:%.*]], <vscale x 4 x i64> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <vscale x 2 x i64>, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S1]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S2]], i64 2)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[S3]], i64 2)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i1> splat (i1 true), ptr [[TMP5]])
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64> %s1, <vscale x 4 x i64> %s2, <vscale x 4 x i64> %s3)
+
+ store <vscale x 12 x i64> %interleave, ptr %ptr, align 4
+ ret void
+}
+
+define void @interleave_neg_nxi8_factor3(ptr %ptr, <vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3) #0 {
+; CHECK-LABEL: define void @interleave_neg_nxi8_factor3
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i8> [[S1:%.*]], <vscale x 8 x i8> [[S2:%.*]], <vscale x 8 x i8> [[S3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> [[S1]], <vscale x 8 x i8> [[S2]], <vscale x 8 x i8> [[S3]])
+; CHECK-NEXT: store <vscale x 24 x i8> [[INTERLEAVE]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %interleave = tail call <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8> %s1, <vscale x 8 x i8> %s2, <vscale x 8 x i8> %s3)
+
+ store <vscale x 24 x i8> %interleave, ptr %ptr, align 4
+ ret void
+}
+
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
@@ -312,4 +531,15 @@ declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <
; Larger interleaves to test 'legalization'
declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+; De-Interleaves with Factor=3
+declare { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave3.nxv3i64(<vscale x 3 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave3.nxv12i64(<vscale x 12 x i64>)
+declare { <vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64> } @llvm.vector.deinterleave3.nxv24i64(<vscale x 24 x i64>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave3.nxv24i8(<vscale x 24 x i8>)
+
+; Interleaves with Factor=3
+declare <vscale x 24 x i8> @llvm.vector.interleave3.nxv24i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 24 x i64> @llvm.vector.interleave3.nxv24i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.interleave3.nxv12i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i64>)
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopUnroll/scevunroll.ll b/llvm/test/Transforms/LoopUnroll/scevunroll.ll
index fa55eab..bc63f79 100644
--- a/llvm/test/Transforms/LoopUnroll/scevunroll.ll
+++ b/llvm/test/Transforms/LoopUnroll/scevunroll.ll
@@ -465,8 +465,7 @@ define void @peel_int_eq_condition(i32 %start) {
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_PEEL]], [[ENTRY_PEEL_NEWPH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[IV]], [[START]]
-; CHECK-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
; CHECK: if.then:
; CHECK-NEXT: call void @fn(i32 [[IV]])
; CHECK-NEXT: br label [[LOOP_LATCH]]
diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 279d4e8..83623fd 100644
--- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK-LABEL: @foo(
;CHECK: icmp eq <4 x i32>
;CHECK: select <4 x i1>
-;CHECK: ret i32
-define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
+;CHECK: ret void
+define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
entry:
%cmp10 = icmp sgt i32 %x, 0
br i1 %cmp10, label %for.body, label %for.end
@@ -35,5 +35,5 @@ if.end: ; preds = %for.body, %if.then
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end, %entry
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 6cf11be..6fe6883 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -660,16 +660,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT1]]
+; COMMON-NEXT: br label %[[EXIT]]
+; COMMON: [[EXIT]]:
+; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
+; COMMON: [[SCALAR_PH]]:
+; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
-; COMMON-NEXT: br label %[[SCALAR_PH1:.*]]
-; COMMON: [[SCALAR_PH1]]:
-; COMMON-NEXT: br [[EXIT:label %.*]]
-; COMMON: [[SCALAR_PH:.*:]]
+; COMMON-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 93e71af..e3e4833 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -42,7 +42,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -80,7 +80,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -104,7 +104,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -167,13 +167,13 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]])
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -211,7 +211,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP35]])
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
@@ -235,7 +235,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
@@ -308,7 +308,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -332,7 +332,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: store i64 0, ptr [[L]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index e424649..75b18ff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -541,3 +541,22 @@ exit: ; preds = %for.body
; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-VS1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS1: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
+; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VS2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VS2: [[PROF3]] = !{!"branch_weights", i32 8, i32 8}
+; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index a6e0f8a..300f5d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -40,6 +40,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -53,6 +54,15 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -262,6 +272,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.modf.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -275,6 +286,15 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.modf.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
@@ -412,6 +432,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[ENTRY:.*:]]
; CHECK-ARMPL: [[VECTOR_PH:.*:]]
; CHECK-ARMPL: [[VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[VECTOR_BODY1:.*:]]
; CHECK-ARMPL: [[TMP12:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]])
; CHECK-ARMPL: [[TMP13:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincospi.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD1:%.*]])
; CHECK-ARMPL: [[TMP14:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP12]], 0
@@ -425,6 +446,15 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa
; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]]
; CHECK-ARMPL: [[SCALAR_PH:.*:]]
; CHECK-ARMPL: [[FOR_BODY:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_VECTOR_BODY:.*:]]
+; CHECK-ARMPL: [[TMP29:%.*]] = call { <4 x float>, <4 x float> } @llvm.sincospi.v4f32(<4 x float> [[WIDE_LOAD3:%.*]])
+; CHECK-ARMPL: [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 0
+; CHECK-ARMPL: [[TMP26:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP29]], 1
+; CHECK-ARMPL: store <4 x float> [[TMP25]], ptr [[TMP30:%.*]], align 4
+; CHECK-ARMPL: store <4 x float> [[TMP26]], ptr [[TMP28:%.*]], align 4
+; CHECK-ARMPL: [[VEC_EPILOG_MIDDLE_BLOCK:.*:]]
+; CHECK-ARMPL: [[VEC_EPILOG_SCALAR_PH:.*:]]
+; CHECK-ARMPL: [[FOR_BODY1:.*:]]
; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]])
; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0
; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 3a88273..56a5663 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2697,4 +2697,4 @@ for.body: ; preds = %for.body.lr.ph, %fo
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!10 = !{!"llvm.loop.vectorize.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
-attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "cpu"="neoverse-v2" }
+attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "target-cpu"="neoverse-v2" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 3c2ae1c7..1e6bcb1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -410,20 +410,32 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP10]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[PARTIAL_REDUCE8]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[PARTIAL_REDUCE9]], [[BIN_RDX10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -432,25 +444,20 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
-; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP1]])
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-MAXBW-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -693,20 +700,32 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = sub <16 x i32> [[VEC_PHI]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = sub <16 x i32> [[VEC_PHI1]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = sub <16 x i32> [[VEC_PHI2]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = sub <16 x i32> [[VEC_PHI3]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX8]])
; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
@@ -1093,9 +1112,124 @@ exit:
ret i32 %add.lcssa
}
+define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 {
+; CHECK-INTERLEAVE1-LABEL: define i64 @sext_reduction_i32_to_i64(
+; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: vector.body:
+; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVE1: middle.block:
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+;
+; CHECK-INTERLEAVED-LABEL: define i64 @sext_reduction_i32_to_i64(
+; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI2]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define i64 @sext_reduction_i32_to_i64(
+; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW: vector.ph:
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW: vector.body:
+; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-MAXBW: middle.block:
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %acc = phi i64 [ 0, %entry ], [ %add, %loop ]
+ %gep = getelementptr inbounds i32, ptr %arr, i64 %iv
+ %load = load i32, ptr %gep
+ %sext = sext i32 %load to i64
+ %add = add i64 %acc, %sext
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ult i64 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i64 %add
+}
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
-attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "cpu"="neoverse-v2" }
+attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "target-cpu"="neoverse-v2" }
attributes #2 = { "target-features"="+neon,+dotprod" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index ab9b48f..aff2c4c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -153,17 +153,20 @@ define void @uniform_gep_for_replicating_gep(ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP18]], align 1
-; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP19]], align 1
+; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
index 8830ce3..5f79d02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -38,8 +38,9 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -96,8 +97,9 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out
; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index d447517..f03f743 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -29,8 +29,9 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[COND:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index b8f4e84..753847f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -33,8 +33,9 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
@@ -87,8 +88,9 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl
; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e..d0c1194 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -36,7 +36,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %exit, label %for.body
}
-define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
+define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction: %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032
@@ -70,7 +70,7 @@ for.cond.cleanup.loopexit: ; preds = %if.end
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 undef
+ ret void
for.body: ; preds = %for.body.preheader, %if.end
%i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index e046816..e84c0d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -67,7 +67,7 @@ define void @test_may_clobber(ptr %p) {
; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -111,7 +111,7 @@ define void @trivial_due_max_vscale(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -155,7 +155,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -207,7 +207,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -221,7 +221,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
; CHECK-NEXT: store i16 0, ptr [[GEP_OFF]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 3
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 2fbc73e..c66d8d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -133,7 +133,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -237,7 +237,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) {
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
@@ -346,7 +346,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -360,7 +360,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -468,7 +468,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -483,7 +483,7 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -640,7 +640,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -656,7 +656,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -790,14 +790,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT1]]
; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META6:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META5:![0-9]+]]
; STRIDED-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP18]]
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP43]]), !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -813,7 +813,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: store i32 [[Y0]], ptr [[Q1]], align 4
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -965,7 +965,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[EXIT:%.*]]
; NOSTRIDED: scalar.ph:
@@ -981,7 +981,7 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], [[STRIDE]]
; NOSTRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
; NOSTRIDED: exit:
; NOSTRIDED-NEXT: ret void
;
@@ -1145,16 +1145,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP18]]
; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]]
+; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META12:![0-9]+]]
; STRIDED-NEXT: [[TMP30:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
-; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]]
+; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP30]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META15:![0-9]+]], !noalias [[META12]]
; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]]
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]]
; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]]
; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[EXIT:%.*]]
; STRIDED: scalar.ph:
@@ -1170,7 +1170,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-NEXT: [[PTR2_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 [[STRIDE]]
; STRIDED-NEXT: [[NEXTI]] = add i64 [[I]], 1
; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]]
; STRIDED: exit:
; STRIDED-NEXT: ret void
;
@@ -1318,7 +1318,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: br label [[LOOP:%.*]]
; NOSTRIDED: exit:
@@ -1402,7 +1402,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) {
; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; STRIDED: middle.block:
; STRIDED-NEXT: br label [[LOOP:%.*]]
; STRIDED: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 0c22a9e..46daee4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -142,7 +142,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -267,7 +267,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -382,7 +382,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
@@ -508,7 +508,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]]
; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: exit:
@@ -621,7 +621,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index bae97e5..c34417b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -129,7 +129,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -143,7 +143,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SCALABLE: [[FOR_END]]:
; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
; SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -204,7 +204,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP3]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
@@ -218,7 +218,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
@@ -269,7 +269,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -350,7 +350,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -399,7 +399,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -457,7 +457,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -499,7 +499,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -557,7 +557,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -608,7 +608,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -679,7 +679,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -731,7 +731,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -812,7 +812,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
@@ -860,7 +860,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
; SCALABLE-NEXT: br label %[[FOR_END:.*]]
; SCALABLE: [[FOR_END]]:
@@ -918,7 +918,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[FOR_END]]:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
index b106f99..1153d18 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -6,7 +6,7 @@
; Check that the addresses for a scalarized memory access is not extracted
; from a vector register.
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -27,7 +27,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 poison
+; CHECK-NEXT: ret void
;
entry:
@@ -44,12 +44,12 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 poison
+ ret void
}
; Check that a load of address is scalarized.
-define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
+define void @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
; CHECK-LABEL: @foo1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -74,7 +74,7 @@ define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 poison
+; CHECK-NEXT: ret void
;
entry:
@@ -93,5 +93,5 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 poison
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
index 9e20586..44fb8cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
; CHECK-LABEL: @read_mod_write_single_ptr(
; CHECK: load <8 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
; CHECK-LABEL: @read_mod_i64(
; SLOWMEM32: load <2 x i64>
; FASTMEM32: load <4 x i64>
-; CHECK: ret i32
-define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e11b1ad..27d5e64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" }
; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1
; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1
-; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
;
;
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
@@ -215,8 +214,9 @@ define void @PR40816() #1 {
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FORCE: [[MIDDLE_BLOCK]]:
-; FORCE-NEXT: br [[RETURN:label %.*]]
-; FORCE: [[SCALAR_PH:.*:]]
+; FORCE-NEXT: br label %[[RETURN:.*]]
+; FORCE: [[RETURN]]:
+; FORCE-NEXT: ret void
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 6d2cda4..0287645 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost1(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
; CHECK-NEXT: br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: ._crit_edge.loopexit:
; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
; CHECK: ._crit_edge:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
%1 = icmp sgt i32 %n, 3
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
-define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: ._crit_edge.loopexit:
; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
; CHECK: ._crit_edge:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
%1 = icmp sgt i32 %n, 9
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 9453ad7..725fa49 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -540,6 +540,8 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[N:%.*]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -551,14 +553,6 @@ define i64 @cost_assume(ptr %end, i64 %N) {
; CHECK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI2]], splat (i64 1)
; CHECK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI3]], splat (i64 1)
; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[VEC_PHI4]], splat (i64 1)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index af5c921..fa3b4a66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux"
;CHECK-LABEL: func1x6(
;CHECK: <4 x i32>
;CHECK: ret
-define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
entry:
br label %for.body
@@ -40,14 +40,14 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
- ret i32 undef
+ ret void
}
; We are vectorizing with 12 runtime checks.
;CHECK-LABEL: func2x6(
;CHECK: <4 x i32>
;CHECK: ret
-define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
entry:
br label %for.body
@@ -85,5 +85,5 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
index 8971dfe..47355e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK-NOUNRL: store <4 x i32>
;CHECK-NOUNRL-NOT: store <4 x i32>
;CHECK-NOUNRL: ret
-define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
+define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 28de5c7..56f0b85 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -58,7 +58,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
@@ -72,7 +72,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20
-; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
@@ -88,7 +88,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -132,14 +132,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
@@ -180,14 +180,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index 65c12a1..224ec4a6 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -34,8 +34,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -73,29 +74,28 @@ define void @test2(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 2
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[FOR_END:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
;
entry:
%ptrint = ptrtoint ptr %a to i64
@@ -163,7 +163,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index f64255f..b7aa958 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; When scalarizing stores we need to preserve the original order.
; Make sure that we are extracting in the correct order (0101, and not 0011).
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
@@ -55,7 +55,7 @@ for.body:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 1588d02..51255b2 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp26 = icmp sgt i32 %n, 0
@@ -106,11 +106,11 @@ if.end14:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
; As above but with multiple variables set per block.
-define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK-LABEL: @multi_variable_if_nest(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp26 = icmp sgt i32 %n, 0
@@ -224,5 +224,5 @@ if.end14:
br i1 %exitcond, label %for.end, label %for.body
for.end:
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
index 8a7f4a3..a88a9b14 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; }
;}
-define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
-; CHECK-LABEL: define i32 @function0(
+define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+; CHECK-LABEL: define void @function0(
; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]]
@@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end)
; CHECK: [[FOR_END_LOOPEXIT]]:
; CHECK-NEXT: br label %[[FOR_END]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp16 = icmp slt i32 %start, %end
@@ -127,7 +127,7 @@ if.end:
br i1 %cmp, label %for.body, label %for.end
for.end:
- ret i32 undef
+ ret void
}
@@ -237,6 +237,8 @@ for.end: ; preds = %for.inc, %entry
; Handle PHI with single incoming value having a full mask.
; PR34523
+; NOTE: Changing PHI inputs from undef to poison leads to change in
+; behaviour of the test. Left as undef for now.
define void @PR34523() {
; CHECK-LABEL: define void @PR34523() {
; CHECK-NEXT: [[BB1:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 742ee64..eea2237 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -337,7 +337,7 @@ for.end: ; preds = %for.body
; }
; }
-define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
; CHECK: for.end10.loopexit:
; CHECK-NEXT: br label [[FOR_END10]]
; CHECK: for.end10:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp20 = icmp eq i32 %itr, 0
@@ -469,12 +469,12 @@ for.inc8: ; preds = %for.body3, %for.con
br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
for.end10: ; preds = %for.inc8, %entry
- ret i32 undef
+ ret void
}
; second uniform store to the same address is conditional.
; we do not vectorize this.
-define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores_conditional(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu
; CHECK: for.end10.loopexit:
; CHECK-NEXT: br label [[FOR_END10]]
; CHECK: for.end10:
-; CHECK-NEXT: ret i32 undef
+; CHECK-NEXT: ret void
;
entry:
%cmp20 = icmp eq i32 %itr, 0
@@ -567,7 +567,7 @@ for.inc8: ; preds = %for.body3, %for.con
br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
for.end10: ; preds = %for.inc8, %entry
- ret i32 undef
+ ret void
}
; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
index b891b43..d9d9eec 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -132,7 +132,7 @@ for.end:
; CHECK-LABEL: @f6
; CHECK-NOT: <2 x i32>
-define i32 @f6(ptr %a, i32 %tmp) {
+define void @f6(ptr %a, i32 %tmp) {
entry:
br label %for.body
@@ -149,7 +149,7 @@ for.body:
br i1 %exitcond, label %for.body, label %for.end
for.end:
- ret i32 undef
+ ret void
}
; Don't vectorize true loop carried dependencies that are not a multiple of the
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 1533906..53dad3a 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -74,8 +74,7 @@ exit:
ret void
}
-; FIXME: Currently this mis-compiled when interleaving; all stores store the
-; last lane of the last part, instead of the last lane per part.
+; Check that each unrolled store stores the last lane of the corresponding part.
; Test case for https://github.com/llvm/llvm-project/issues/162498.
define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) {
; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
@@ -121,13 +120,15 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2IC2-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; VF2IC2-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2
; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3
-; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1
+; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP7]], 1
; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]]
-; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
+; VF2IC2-NEXT: store i32 [[TMP8]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/operand-bundles.ll b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
new file mode 100644
index 0000000..ce07364
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/operand-bundles.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @call_loop_invariant_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_loop_invariant_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "deopt"(float 1.000000e+01) ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "deopt"(float 10.0) ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @call_unknown_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_unknown_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "unknown"(ptr null) ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "unknown"(ptr null) ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @call_cold_operand_bundle(ptr %dst, {float, float} %sv) {
+; CHECK-LABEL: define void @call_cold_operand_bundle(
+; CHECK-SAME: ptr [[DST:%.*]], { float, float } [[SV:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractvalue { float, float } [[SV]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[SV]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT2]]) [ "cold"() ]
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, ptr %dst, i32 %iv
+ %p = call float @llvm.pow.f32(float %a, float %b) [ "cold"() ]
+ store float %p, ptr %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @assume_loop_variant_operand_bundle(ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define void @assume_loop_variant_operand_bundle(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP0]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP1]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP2]]) ]
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 [[TMP3]]) ]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+02
+ tail call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 %iv) ]
+ %add = fadd float %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv
+ store float %add, ptr %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv, 1599
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @assume_cold_operand_bundle(ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define void @assume_cold_operand_bundle(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: tail call void @llvm.assume(i1 true) [ "cold"() ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+02
+ tail call void @llvm.assume(i1 true) [ "cold"() ]
+ %add = fadd float %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, ptr %a, i64 %iv
+ store float %add, ptr %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv, 1599
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
index d700d48..f5e480c 100644
--- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
+++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -10,7 +10,7 @@
; CHECK: store i64 %indvars.outer, ptr %O2, align 4
-define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
@@ -50,5 +50,5 @@ for.end.outer.loopexit: ; preds = %for.end.inner
br label %for.end.outer
for.end.outer: ; preds = %for.end.outer.loopexit, %entry
- ret i64 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll
index ad7f6e7..0a9c8c1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr28541.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll
@@ -28,7 +28,7 @@
; CHECK-NOT: vectorized loop
; CHECK-LABEL: fn1
-define i32 @fn1() {
+define void @fn1() {
entry:
%tmp2 = load i32, ptr @b, align 4
%dec3 = add nsw i32 %tmp2, -1
@@ -67,5 +67,5 @@ while.cond.while.end_crit_edge: ; preds = %while.cond
br label %while.end
while.end: ; preds = %while.cond.while.end_crit_edge, %entry
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr48832.ll b/llvm/test/Transforms/LoopVectorize/pr48832.ll
index b89be88..c6ebe85 100644
--- a/llvm/test/Transforms/LoopVectorize/pr48832.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr48832.ll
@@ -23,7 +23,7 @@ for.body: ; preds = %for.cond
br i1 true, label %cond.false, label %land.rhs
land.rhs: ; preds = %for.body
- br i1 poison, label %cond.end, label %cond.false
+ br i1 false, label %cond.end, label %cond.false
cond.false: ; preds = %for.body, %land.rhs
br label %cond.end
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll b/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll
new file mode 100644
index 0000000..e4322cf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll
@@ -0,0 +1,588 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define i32 @umax_phi_used_outside(ptr %src, i32 %n) {
+; CHECK-LABEL: define i32 @umax_phi_used_outside(
+; CHECK-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEXT: [[SPEC_SELECT]] = tail call i32 @llvm.umax.i32(i32 [[MAX]], i32 [[L_EXT]])
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_LCSSA:%.*]] = phi i32 [ [[MAX]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[MAX_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %spec.select, %loop ]
+ %gep.src = getelementptr inbounds i8, ptr %src, i32 %iv
+ %l = load i8, ptr %gep.src
+ %l.ext = zext i8 %l to i32
+ %spec.select = tail call i32 @llvm.umax.i32(i32 %max, i32 %l.ext)
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %max
+}
+
+define i32 @chained_smax(i32 %x, ptr %src) {
+; CHECK-LABEL: define i32 @chained_smax(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 1)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[VEC_PHI]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP17]], i32 2
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK: [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6]]
+; CHECK: [[PRED_LOAD_IF5]]:
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP23]], i32 3
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK: [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i32> [ [[TMP19]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], %[[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP26]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP25]], <4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP26]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP27]])
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[TMP28]]
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr [3 x i32], ptr %src, i64 %iv
+ %max.1 = tail call i32 @llvm.smax.i32(i32 %x, i32 %max)
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %l, i32 %max.1)
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 1
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %max.next
+}
+
+define void @smax_with_invariant_store_user(ptr noalias %src, ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @smax_with_invariant_store_user(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ store i32 %max.next, ptr %dst, align 4
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @smax_with_multiple_invariant_store_user_same_addr(ptr noalias %src, ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @smax_with_multiple_invariant_store_user_same_addr(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ store i32 %max.next, ptr %dst, align 4
+ %iv.next = add i64 %iv, 1
+ store i32 %max.next, ptr %dst, align 4
+ %ec = icmp eq i64 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @smax_with_multiple_invariant_store_user_same_addr2(ptr noalias %src, ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @smax_with_multiple_invariant_store_user_same_addr2(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: store i32 0, ptr [[DST]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ store i32 %max.next, ptr %dst, align 4
+ %iv.next = add i64 %iv, 1
+ store i32 0, ptr %dst, align 4
+ %ec = icmp eq i64 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @smax_with_multiple_invariant_store_user_same_addr3(ptr noalias %src, ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @smax_with_multiple_invariant_store_user_same_addr3(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: store i32 0, ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ store i32 0, ptr %dst, align 4
+ %iv.next = add i64 %iv, 1
+ store i32 %max.next, ptr %dst, align 4
+ %ec = icmp eq i64 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @smax_with_multiple_invariant_store_user_different_addr(ptr noalias %src, ptr noalias %dst, ptr noalias %dst.2, i64 %n) {
+; CHECK-LABEL: define void @smax_with_multiple_invariant_store_user_different_addr(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], ptr noalias [[DST_2:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: store i32 [[MAX_NEXT]], ptr [[DST_2]], align 4
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ store i32 %max.next, ptr %dst, align 4
+ %iv.next = add i64 %iv, 1
+ store i32 %max.next, ptr %dst.2, align 4
+ %ec = icmp eq i64 %iv, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define i32 @chained_instructions_feeding_max1(i32 %x, ptr %src) {
+; CHECK-LABEL: define i32 @chained_instructions_feeding_max1(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MAX]], [[L]]
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[ADD]], i32 [[L]])
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi i32 [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[MAX_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr [3 x i32], ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %add = add i32 %max, %l
+ %max.next = tail call i32 @llvm.smax.i32(i32 %add, i32 %l)
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 1
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %max.next
+}
+
+define i32 @chained_instructions_feeding_max2(i32 %x, ptr %src) {
+; CHECK-LABEL: define i32 @chained_instructions_feeding_max2(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr [3 x i32], ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[MAX_1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X]], i32 [[MAX]])
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L]], [[MAX_1]]
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[ADD]], i32 100)
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi i32 [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[MAX_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %max = phi i32 [ 0, %entry ], [ %max.next, %loop ]
+ %gep.src = getelementptr [3 x i32], ptr %src, i64 %iv
+ %max.1 = tail call i32 @llvm.smax.i32(i32 %x, i32 %max)
+ %l = load i32, ptr %gep.src, align 4
+ %add = add i32 %l, %max.1
+ %max.next = tail call i32 @llvm.smax.i32(i32 %add, i32 100)
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 1
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %max.next
+}
+
+
+define i32 @test_predicated_smin(ptr %src) {
+; CHECK-LABEL: define i32 @test_predicated_smin(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = fdiv <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[TMP3]])
+; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[PREDPHI]])
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %min = phi i32 [ 0, %entry ], [ %min.merge, %loop.latch ]
+ %gep.src = getelementptr float, ptr %src, i64 %iv
+ %l = load float, ptr %gep.src, align 4
+ %c = fcmp une float %l, 0.0
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %div = fdiv float %l, 3.0
+ %div.i32 = fptosi float %div to i32
+ %min.next = tail call i32 @llvm.smin.i32(i32 %min, i32 %div.i32)
+ br label %loop.latch
+
+loop.latch:
+ %min.merge = phi i32 [ %min.next, %then ], [ %min, %loop.header ]
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 111
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret i32 %min.merge
+}
+
+define i32 @smax_reduction_multiple_incoming(ptr %src, i32 %n, i1 %cond) {
+; CHECK-LABEL: define i32 @smax_reduction_multiple_incoming(
+; CHECK-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[LOOP_HEADER_PREHEADER:.*]], label %[[ELSE:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER]]
+; CHECK: [[LOOP_HEADER_PREHEADER]]:
+; CHECK-NEXT: [[IV_PH:%.*]] = phi i32 [ 10, %[[ELSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[MAX_PH:%.*]] = phi i32 [ 5, %[[ELSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[IV_PH]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[N]], [[IV_PH]]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[IV_PH]], [[N_VEC]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MAX_PH]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IV_PH]], [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[IV_PH]], %[[LOOP_HEADER_PREHEADER]] ], [ [[IV_PH]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ [[MAX_PH]], %[[LOOP_HEADER_PREHEADER]] ], [ [[MAX_PH]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP_HEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ [[MAX_NEXT:%.*]], %[[LOOP_HEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MAX_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[MAX]], i32 [[L]])
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi i32 [ [[MAX_NEXT]], %[[LOOP_HEADER]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[MAX_NEXT_LCSSA]]
+;
+entry:
+ br i1 %cond, label %loop.header, label %else
+
+else:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ 10, %else ], [ %iv.next, %loop.header ]
+ %max = phi i32 [ 0, %entry ], [ 5, %else ], [ %max.next, %loop.header ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
+ %l = load i32, ptr %gep.src, align 4
+ %max.next = tail call i32 @llvm.smax.i32(i32 %max, i32 %l)
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv, %n
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret i32 %max.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index f87be5a..6ea227f 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; a[i] = b[i] * 3;
; }
-define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]]
@@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG14:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: ret i32 undef, !dbg [[DBG14]]
+; CHECK-NEXT: ret void, !dbg [[DBG14]]
;
; FORCED_OPTSIZE-LABEL: @foo(
; FORCED_OPTSIZE-NEXT: entry:
@@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
; FORCED_OPTSIZE: for.end.loopexit:
; FORCED_OPTSIZE-NEXT: br label [[FOR_END]], !dbg [[DBG10:![0-9]+]]
; FORCED_OPTSIZE: for.end:
-; FORCED_OPTSIZE-NEXT: ret i32 undef, !dbg [[DBG10]]
+; FORCED_OPTSIZE-NEXT: ret void, !dbg [[DBG10]]
;
entry:
%cmp6 = icmp sgt i32 %n, 0, !dbg !6
@@ -99,7 +99,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.end, label %for.body, !dbg !7
for.end: ; preds = %for.body, %entry
- ret i32 undef, !dbg !8
+ ret void, !dbg !8
}
; Make sure that we try to vectorize loops with a runtime check if the
@@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]]
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]]
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: store i32 0, ptr [[IN]], align 4
; CHECK-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index ad8cd42..667df3a 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -88,11 +88,11 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP7]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 1
@@ -101,8 +101,6 @@ define void @test2(ptr %a, ptr noalias %b) {
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD3]], splat (float 1.000000e+00)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 9ed2240..9357adf 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -273,3 +273,106 @@ loop:
exit:
ret void
}
+
+define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, ptr noalias %dst) {
+; CHECK-LABEL: define void @ld_div2_ld_scevunknown_nonuniform
+; CHECK-SAME: (ptr [[SRC_A:%.*]], ptr noalias [[SRC_B:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 3
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 5
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 6
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7
+; CHECK-NEXT: [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2)
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4
+; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4
+; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4
+; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4
+; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4
+; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4
+; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP58]], i32 [[TMP51]], i32 2
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP52]], i32 3
+; CHECK-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP60]], i32 [[TMP53]], i32 4
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP54]], i32 5
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP55]], i32 6
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP56]], i32 7
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store <8 x i32> [[TMP64]], ptr [[TMP65]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i32, ptr %src.a, i64 %iv
+ %load.a = load i64, ptr %gep.a
+ %d = udiv i64 %load.a, 2
+ %gep.b = getelementptr i32, ptr %src.b, i64 %d
+ %load.b = load i32, ptr %gep.b
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %load.b, ptr %gep.dst
+ %iv.next = add i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 1000
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll
index cc21b94..8df71e83 100644
--- a/llvm/test/Transforms/LoopVectorize/write-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/write-only.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK-LABEL: @read_mod_write_single_ptr(
;CHECK: load <4 x float>
-;CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+;CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
; Ensure that volatile stores are not vectorized.
; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store(
; CHECK-NOT: store <4 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
- ret i32 undef
+ ret void
}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
new file mode 100644
index 0000000..abd1d96
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -0,0 +1,365 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+
+; REQUIRES: aarch64-registered-target
+
+; See the comment in `data-layout.ll` for an explanation.
+
+target triple = "aarch64-unknown-unknown"
+
+define void @multiply(ptr %A, ptr %B, ptr %C) {
+; PTR64-LABEL: @multiply(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; PTR64-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
+; PTR64-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; PTR64-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR64: alias_cont:
+; PTR64-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
+; PTR64-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR64: copy:
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS]]
+; PTR64: no_alias:
+; PTR64-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR64-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
+; PTR64-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
+; PTR64-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64
+; PTR64-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR64: alias_cont1:
+; PTR64-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
+; PTR64-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR64: copy2:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS3]]
+; PTR64: no_alias3:
+; PTR64-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR64-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR64-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR64-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR64-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR64-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR64-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR64-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR64-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR64-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR64-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
+; PTR64-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR64-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR64-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR64-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR64-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR64-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR64-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR64-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR64-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR64-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR64-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR64-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR64-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR64-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
+; PTR64-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR64-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
+; PTR64-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR64-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR64-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR64-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR64-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR64-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR64-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR64-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR64-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR64-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR64-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR64-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR64-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR64-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR64-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
+; PTR64-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR64-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
+; PTR64-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR64-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR64-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR64-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR64-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR64-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR64-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR64-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR64-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR64-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR64-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR64-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR64-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
+; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
+; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: ret void
+;
+; PTR32-LABEL: @multiply(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32
+; PTR32-NEXT: [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128
+; PTR32-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32
+; PTR32-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR32: alias_cont:
+; PTR32-NEXT: [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128
+; PTR32-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR32: copy:
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS]]
+; PTR32: no_alias:
+; PTR32-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR32-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32
+; PTR32-NEXT: [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128
+; PTR32-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32
+; PTR32-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR32: alias_cont1:
+; PTR32-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128
+; PTR32-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR32: copy2:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS3]]
+; PTR32: no_alias3:
+; PTR32-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR32-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR32-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR32-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR32-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR32-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR32-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR32-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR32-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR32-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR32-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR32-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR32-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32
+; PTR32-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR32-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR32-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR32-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR32-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR32-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR32-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR32-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR32-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR32-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR32-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR32-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16
+; PTR32-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR32-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48
+; PTR32-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR32-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR32-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR32-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR32-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR32-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR32-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR32-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR32-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR32-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR32-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR32-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR32-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR32-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64
+; PTR32-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR32-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96
+; PTR32-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR32-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR32-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR32-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR32-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR32-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR32-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR32-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR32-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR32-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR32-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR32-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR32-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR32-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR32-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80
+; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
+; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: ret void
+;
+entry:
+ %a = load <16 x double>, ptr %A, align 8
+ %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
+ store <16 x double> %c, ptr %C, align 8
+ ret void
+}
+
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
new file mode 100644
index 0000000..3d05014
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
+
+; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without
+; the need to emit `libc` calls), we perform strided index calculations using
+; the same pointer bit-width as the matrix pointers, as determined by the data
+; layout. To check this behaviour, this test runs several strided loads and
+; stores through the lowering pass with (32|64)-bit pointers, and verifies that
+; the generated code extends / truncates strides accordingly. Similarly,
+; `data-layout-multiply-fused.ll` adopts this approach to verify the same
+; behaviour for index calculations emitted while lowering fused matrix
+; multiplies.
+
+define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
+; PTR64-LABEL: @strided_load_3x3_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
+; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
+; PTR64-LABEL: @strided_load_3x3_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
+; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
new file mode 100644
index 0000000..4ec5898
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder-rm.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -matrix-default-layout=row-major -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[SPLAT_SPLAT13]], [[BLOCK11]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[SPLAT_SPLAT16]], [[BLOCK14]]
+; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[SPLAT_SPLAT19]], [[BLOCK17]]
+; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT: ret void
+;
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[B]], i64 3
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD2:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[B]], i64 6
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[VEC_GEP3]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[SPLAT_SPLAT]], [[BLOCK]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD2]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 1
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[SPLAT_SPLAT7]], [[BLOCK5]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD4]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD]], i64 2
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[SPLAT_SPLAT10]], [[BLOCK8]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT: ret void
+;
+ %a_load = load <3 x float>, ptr %a, align 4
+ %b_load = load <9 x float>, ptr %b, align 4
+ %matmul = tail call <3 x float> @llvm.matrix.multiply.v3f32.v9f32.v3f32(<3 x float> %a_load, <9 x float> %b_load, i32 1, i32 3, i32 3)
+ store <3 x float> %matmul, ptr %c, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
new file mode 100644
index 0000000..fbc2cbc
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-remainder.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=96 -S < %s | FileCheck --check-prefix=NO_SPLIT_REMAINDER %s
+; RUN: opt -passes='lower-matrix-intrinsics' -matrix-split-matmul-remainder-over-threshold=64 -S < %s | FileCheck --check-prefix=SPLIT_REMAINDER %s
+
+; REQUIRES: aarch64-registered-target
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:8:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+define void @matmul(ptr %a, ptr %b, ptr %c) {
+; SPLIT_REMAINDER-LABEL: define void @matmul(
+; SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT6]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP2]], [[TMP4]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x float> [[SPLAT_SPLATINSERT9]], <2 x float> poison, <2 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 2>
+; SPLIT_REMAINDER-NEXT: [[BLOCK11:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP11:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x float> poison, float [[TMP11]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT12]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP12:%.*]] = fmul <1 x float> [[BLOCK11]], [[SPLAT_SPLAT13]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK14:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP13:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x float> poison, float [[TMP13]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT15]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP14:%.*]] = fmul <1 x float> [[BLOCK14]], [[SPLAT_SPLAT16]]
+; SPLIT_REMAINDER-NEXT: [[TMP15:%.*]] = fadd <1 x float> [[TMP12]], [[TMP14]]
+; SPLIT_REMAINDER-NEXT: [[BLOCK17:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <1 x i32> <i32 2>
+; SPLIT_REMAINDER-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x float> poison, float [[TMP16]], i64 0
+; SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT18]], <1 x float> poison, <1 x i32> zeroinitializer
+; SPLIT_REMAINDER-NEXT: [[TMP17:%.*]] = fmul <1 x float> [[BLOCK17]], [[SPLAT_SPLAT19]]
+; SPLIT_REMAINDER-NEXT: [[TMP18:%.*]] = fadd <1 x float> [[TMP15]], [[TMP17]]
+; SPLIT_REMAINDER-NEXT: [[TMP19:%.*]] = shufflevector <1 x float> [[TMP18]], <1 x float> poison, <3 x i32> <i32 0, i32 poison, i32 poison>
+; SPLIT_REMAINDER-NEXT: [[TMP20:%.*]] = shufflevector <3 x float> [[TMP10]], <3 x float> [[TMP19]], <3 x i32> <i32 0, i32 1, i32 3>
+; SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP20]], ptr [[C]], align 4
+; SPLIT_REMAINDER-NEXT: ret void
+;
+; NO_SPLIT_REMAINDER-LABEL: define void @matmul(
+; NO_SPLIT_REMAINDER-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD:%.*]] = load <3 x float>, ptr [[A]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[A]], i64 3
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD1:%.*]] = load <3 x float>, ptr [[VEC_GEP]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[A]], i64 6
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD3:%.*]] = load <3 x float>, ptr [[VEC_GEP2]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[COL_LOAD4:%.*]] = load <3 x float>, ptr [[B]], align 4
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK:%.*]] = shufflevector <3 x float> [[COL_LOAD]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP2:%.*]] = fmul <3 x float> [[BLOCK]], [[SPLAT_SPLAT]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK5:%.*]] = shufflevector <3 x float> [[COL_LOAD1]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 1
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT6:%.*]] = insertelement <3 x float> poison, float [[TMP3]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT7:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT6]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP4:%.*]] = fmul <3 x float> [[BLOCK5]], [[SPLAT_SPLAT7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP5:%.*]] = fadd <3 x float> [[TMP2]], [[TMP4]]
+; NO_SPLIT_REMAINDER-NEXT: [[BLOCK8:%.*]] = shufflevector <3 x float> [[COL_LOAD3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[COL_LOAD4]], i64 2
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLATINSERT9:%.*]] = insertelement <3 x float> poison, float [[TMP6]], i64 0
+; NO_SPLIT_REMAINDER-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT9]], <3 x float> poison, <3 x i32> zeroinitializer
+; NO_SPLIT_REMAINDER-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[BLOCK8]], [[SPLAT_SPLAT10]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP5]], [[TMP7]]
+; NO_SPLIT_REMAINDER-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP8]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NO_SPLIT_REMAINDER-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> poison, <3 x float> [[TMP9]], <3 x i32> <i32 3, i32 4, i32 5>
+; NO_SPLIT_REMAINDER-NEXT: store <3 x float> [[TMP10]], ptr [[C]], align 4
+; NO_SPLIT_REMAINDER-NEXT: ret void
+;
+ %a_load = load <9 x float>, ptr %a, align 4
+ %b_load = load <3 x float>, ptr %b, align 4
+ %matmul = tail call <3 x float> @llvm.matrix.multiply.v9f32.v3f32.v3f32(<9 x float> %a_load, <3 x float> %b_load, i32 3, i32 3, i32 1)
+ store <3 x float> %matmul, ptr %c, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
index ae7da19..abc4705 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
@@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32,
define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) {
; CHECK-LABEL: @strided_load_4x2_stride_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x double> [[TMP0]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
index 28e9cdb..81b8507 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
@@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride
; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride(
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]]
+; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]]
; CHECK-NEXT: store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
; RUN: rm -rf %t && split-file %s %t && cd %t
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
;; Run optimizer pass on an IR module without IR functions, and test that global
;; variables in the module could be annotated (i.e., no early return),
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
;; Run optimizer pass on the IR, and check the section prefix.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof without providing memprof data. Test that IR has module flag
+;; `EnableDataAccessProf` as 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
; LOG: Skip annotating string literal .str
; LOG: Global variable var1 is annotated as hot
; LOG: Global variable var2.llvm.125 is annotated as hot
; LOG: Global variable bar is not annotated
; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
;; @var.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
;; @bar is not seen in hot symbol or known symbol set, so it won't get a section
;; prefix. Test this by testing that there is no section_prefix between @bar and
;; @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
- foo
KnownColdStrHashes: [ 999, 1001 ]
...
+;--- memprof-no-dap.yaml
+---
+# A memprof file without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+ - GUID: 0xdeadbeef12345678
+ AllocSites:
+ - Callstack:
+ - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+ MemInfoBlock:
+ AllocCount: 111
+ TotalSize: 222
+ TotalLifetime: 333
+ TotalLifetimeAccessDensity: 444
+ CallSites:
+ - Frames:
+ - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
;--- input.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
define i32 @func() {
%a = load i32, ptr @var1
%b = load i32, ptr @var2.llvm.125
- %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+ %c = load i32, ptr @llvm.fake_var
+ %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
ret i32 %ret
}
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
index c5f72f2..fded7a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,21 +4,9 @@
define i32 @crash_reordering_undefs() {
; CHECK-LABEL: @crash_reordering_undefs(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
+; CHECK-NEXT: [[ADD0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 65537))
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
-; CHECK-NEXT: ret i32 [[OP_RDX3]]
+; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
%or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01..13b050d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -6,15 +6,15 @@ define i1 @test(i32 %g, i16 %d) {
; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1
-; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i32> [[TMP2]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f..43302f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,32 +3,7 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
; CHECK-NEXT: [[TMP24:%.*]] = sext i8 [[TMP23]] to i32
; CHECK-NEXT: ret i32 [[TMP24]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll
new file mode 100644
index 0000000..2f97b41
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/last-non-copyable-inst-used-outside-bb.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: br label %[[BB1:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP7:%.*]], %[[BB16:.*]] ], [ zeroinitializer, %[[BB1]] ]
+; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5:.*]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[PHI8:%.*]] = phi double [ 0.000000e+00, %[[BB16]] ], [ 0.000000e+00, %[[BB1]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], %[[BB16]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB1]] ]
+; CHECK-NEXT: switch i32 0, label %[[BB21:.*]] [
+; CHECK-NEXT: i32 4, label %[[BB21]]
+; CHECK-NEXT: i32 1, label %[[BB21]]
+; CHECK-NEXT: i32 0, label %[[BB9:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[PHI13:%.*]] = phi double [ 0.000000e+00, %[[BB21]] ], [ 0.000000e+00, %[[BB5]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP1]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP9:%.*]], %[[BB21]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB5]] ]
+; CHECK-NEXT: switch i32 0, label %[[BB15:.*]] [
+; CHECK-NEXT: i32 1, label %[[BB14:.*]]
+; CHECK-NEXT: i32 0, label %[[BB16]]
+; CHECK-NEXT: ]
+; CHECK: [[BB14]]:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[PHI20:%.*]] = phi double [ 0.000000e+00, %[[BB15]] ], [ 0.000000e+00, %[[BB14]] ], [ 0.000000e+00, %[[BB9]] ]
+; CHECK-NEXT: [[TMP7]] = phi <4 x i32> [ [[TMP5]], %[[BB15]] ], [ [[TMP4]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ]
+; CHECK-NEXT: [[TMP8]] = phi <4 x i32> [ [[TMP6]], %[[BB15]] ], [ [[TMP3]], %[[BB14]] ], [ <i32 poison, i32 poison, i32 0, i32 0>, %[[BB9]] ]
+; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB1]]
+; CHECK: [[BB21]]:
+; CHECK-NEXT: [[TMP9]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: br label %[[BB9]]
+;
+bb:
+ br label %bb1
+
+bb1:
+ %phi = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi17, %bb16 ]
+ %phi2 = phi i32 [ 0, %bb ], [ 0, %bb1 ], [ %phi18, %bb16 ]
+ %phi3 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ]
+ %phi4 = phi i32 [ 0, %bb ], [ poison, %bb16 ], [ 0, %bb1 ]
+ br i1 false, label %bb1, label %bb5
+
+bb5:
+ %phi6 = phi i32 [ %phi17, %bb16 ], [ 0, %bb1 ]
+ %phi7 = phi i32 [ %phi19, %bb16 ], [ 0, %bb1 ]
+ %phi8 = phi double [ 0.000000e+00, %bb16 ], [ 0.000000e+00, %bb1 ]
+ switch i32 0, label %bb21 [
+ i32 4, label %bb21
+ i32 1, label %bb21
+ i32 0, label %bb9
+ ]
+
+bb9:
+ %phi10 = phi i32 [ %phi6, %bb21 ], [ 0, %bb5 ]
+ %phi11 = phi i32 [ %phi7, %bb21 ], [ 0, %bb5 ]
+ %phi12 = phi i32 [ 0, %bb21 ], [ 0, %bb5 ]
+ %phi13 = phi double [ 0.000000e+00, %bb21 ], [ 0.000000e+00, %bb5 ]
+ switch i32 0, label %bb15 [
+ i32 1, label %bb14
+ i32 0, label %bb16
+ ]
+
+bb14:
+ br label %bb16
+
+bb15:
+ %add = add i32 0, %phi10
+ br label %bb16
+
+bb16:
+ %phi17 = phi i32 [ %add, %bb15 ], [ %phi10, %bb14 ], [ 0, %bb9 ]
+ %phi18 = phi i32 [ %phi11, %bb15 ], [ 0, %bb14 ], [ 0, %bb9 ]
+ %phi19 = phi i32 [ %phi12, %bb15 ], [ %phi12, %bb14 ], [ 0, %bb9 ]
+ %phi20 = phi double [ 0.000000e+00, %bb15 ], [ 0.000000e+00, %bb14 ], [ 0.000000e+00, %bb9 ]
+ br i1 false, label %bb5, label %bb1
+
+bb21:
+ br label %bb9
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4..3e9bd78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,12 +3,8 @@
define void @test() {
; CHECK-LABEL: define void @test() {
-; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 034fe82..c5442b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -6,11 +6,10 @@
define void @foo() {
; CHECK-LABEL: define void @foo() {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
@@ -24,11 +23,10 @@ define void @foo() {
;
; FORCED-LABEL: define void @foo() {
; FORCED-NEXT: bb:
-; FORCED-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; FORCED-NEXT: br label [[BB1:%.*]]
; FORCED: bb1:
; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; FORCED-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21..e8078ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,23 +5,22 @@ define i32 @test(i1 %cond) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i1 [[COND:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[OR92:%.*]] = or i32 1, 0
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
-; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92]], %[[BB]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[OR92]] = or i32 1, 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
-; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], <i32 1, i32 0>
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a..b9f8390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -8,42 +8,21 @@ define i16 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 0
; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT: [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = shl i32 0, 0
; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 0, 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = shufflevector <24 x i1> [[TMP29]], <24 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <28 x i1> [[RDX_OP]] to i28
-; CHECK-NEXT: [[TMP31:%.*]] = call i28 @llvm.ctpop.i28(i28 [[TMP30]])
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i28 [[TMP31]] to i16
-; CHECK-NEXT: [[TMP33:%.*]] = call i4 @llvm.ctpop.i4(i4 -8)
-; CHECK-NEXT: [[TMP34:%.*]] = zext i4 [[TMP33]] to i16
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i16 [[TMP34]], [[TMP32]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
+; CHECK-NEXT: [[OP_RDX4:%.*]] = trunc i32 [[TMP15]] to i16
; CHECK-NEXT: ret i16 [[OP_RDX4]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
index a7f8629..78708a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
@@ -6,20 +6,12 @@ define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr null, align 2
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 0, 1
; CHECK-NEXT: [[TMP2:%.*]] = and i32 0, 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 0, i8 poison, i8 poison, i8 poison>, i8 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> zeroinitializer, zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 0, [[TMP14]]
; CHECK-NEXT: store i32 [[OP_RDX]], ptr null, align 4
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 616617b..5d5a610 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
@@ -36,11 +36,11 @@ exit:
ret i32 %result
}
-define i32 @test2() {
+define i32 @test2() !prof !0 {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COND:%.*]] = icmp sle i32 0, 1
-; CHECK-NEXT: [[RESULT_SROA_SPECULATED:%.*]] = select i1 [[COND]], i32 1, i32 0
+; CHECK-NEXT: [[RESULT_SROA_SPECULATED:%.*]] = select i1 [[COND]], i32 1, i32 0, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: ret i32 [[RESULT_SROA_SPECULATED]]
;
entry:
@@ -53,7 +53,7 @@ entry:
%v1 = load i32, ptr %a1
%cond = icmp sle i32 %v0, %v1
- %select = select i1 %cond, ptr %a1, ptr %a
+ %select = select i1 %cond, ptr %a1, ptr %a, !prof !1
%result = load i32, ptr %select
ret i32 %result
@@ -870,3 +870,17 @@ define i8 @volatile_select(ptr %p, i1 %b) {
%v2 = load i8, ptr %px
ret i8 %v2
}
+
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 3, i32 5}
+;.
+; CHECK-PRESERVE-CFG: attributes #[[ATTR0:[0-9]+]] = { sanitize_address }
+;.
+; CHECK-MODIFY-CFG: attributes #[[ATTR0:[0-9]+]] = { sanitize_address }
+;.
+; CHECK-PRESERVE-CFG: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK-PRESERVE-CFG: [[PROF1]] = !{!"branch_weights", i32 3, i32 5}
+;.
+; CHECK-MODIFY-CFG: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK-MODIFY-CFG: [[PROF1]] = !{!"branch_weights", i32 3, i32 5}
+;.
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index 776624c..45c3bbd 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -1,9 +1,12 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -passes='sroa<preserve-cfg>' < %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt -S -passes='sroa<modify-cfg>' < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
%pair = type { i32, i32 }
+;.
+; CHECK: @g = global %pair zeroinitializer, align 4
+;.
define i32 @test_sroa_phi_gep(i1 %cond) {
; CHECK-LABEL: @test_sroa_phi_gep(
; CHECK-NEXT: entry:
@@ -334,18 +337,18 @@ exit:
unreachable
}
-define void @test_sroa_gep_phi_select_same_block(i1 %c1, i1 %c2, ptr %ptr) {
+define void @test_sroa_gep_phi_select_same_block(i1 %c1, i1 %c2, ptr %ptr) !prof !0 {
; CHECK-LABEL: @test_sroa_gep_phi_select_same_block(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [[PAIR:%.*]], align 8
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[ALLOCA]], [[ENTRY:%.*]] ], [ [[SELECT:%.*]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[SELECT]] = select i1 [[C1:%.*]], ptr [[PHI]], ptr [[PTR:%.*]]
+; CHECK-NEXT: [[SELECT]] = select i1 [[C1:%.*]], ptr [[PHI]], ptr [[PTR:%.*]], !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: [[PHI_SROA_GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i64 1
; CHECK-NEXT: [[PTR_SROA_GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PTR]], i64 1
-; CHECK-NEXT: [[SELECT_SROA_SEL:%.*]] = select i1 [[C1]], ptr [[PHI_SROA_GEP]], ptr [[PTR_SROA_GEP]]
-; CHECK-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[WHILE_BODY]]
+; CHECK-NEXT: [[SELECT_SROA_SEL:%.*]] = select i1 [[C1]], ptr [[PHI_SROA_GEP]], ptr [[PTR_SROA_GEP]], !prof [[PROF1]]
+; CHECK-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[WHILE_BODY]], !prof [[PROF2:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -355,9 +358,9 @@ entry:
while.body:
%phi = phi ptr [ %alloca, %entry ], [ %select, %while.body ]
- %select = select i1 %c1, ptr %phi, ptr %ptr
+ %select = select i1 %c1, ptr %phi, ptr %ptr, !prof !1
%gep = getelementptr inbounds %pair, ptr %select, i64 1
- br i1 %c2, label %exit, label %while.body
+ br i1 %c2, label %exit, label %while.body, !prof !2
exit:
ret void
@@ -747,6 +750,18 @@ declare ptr @foo()
declare i32 @__gxx_personality_v0(...)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 3, i32 5}
+!2 = !{!"branch_weights", i32 7, i32 11}
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 3, i32 5}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 7, i32 11}
+;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll
index b48b0f7..a701d78 100644
--- a/llvm/test/Transforms/SROA/select-gep.ll
+++ b/llvm/test/Transforms/SROA/select-gep.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -passes='sroa<preserve-cfg>' < %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt -S -passes='sroa<modify-cfg>' < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
@@ -203,10 +203,10 @@ define i32 @test_select_idx_mem2reg(i1 %c) {
; Test gep with a select-like zext index unfolding on an alloca that is
; splittable and promotable.
-define i64 @test_select_like_zext_idx_mem2reg(i1 %c) {
+define i64 @test_select_like_zext_idx_mem2reg(i1 %c) !prof !0 {
; CHECK-LABEL: @test_select_like_zext_idx_mem2reg(
; CHECK-NEXT: [[IDX:%.*]] = zext i1 [[C:%.*]] to i64
-; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 2, i64 1
+; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 2, i64 1, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: ret i64 [[RES]]
;
%alloca = alloca [2 x i64], align 8
@@ -352,3 +352,16 @@ define i32 @test_select_idx_not_constant3(i1 %c, ptr %p, i64 %arg) {
%res = load i32, ptr %gep, align 4
ret i32 %res
}
+
+!0 = !{!"function_entry_count", i32 10}
+;.
+; CHECK-PRESERVE-CFG: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK-MODIFY-CFG: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; CHECK-PRESERVE-CFG: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK-PRESERVE-CFG: [[PROF1]] = !{!"unknown", !"sroa"}
+;.
+; CHECK-MODIFY-CFG: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK-MODIFY-CFG: [[PROF1]] = !{!"unknown", !"sroa"}
+;.
diff --git a/llvm/test/Transforms/SROA/slice-width.ll b/llvm/test/Transforms/SROA/slice-width.ll
index eabb697..3b77e49 100644
--- a/llvm/test/Transforms/SROA/slice-width.ll
+++ b/llvm/test/Transforms/SROA/slice-width.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-f80:128-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
@@ -8,6 +8,10 @@ declare void @llvm.memset.p0.i32(ptr nocapture, i8, i32, i1) nounwind
declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
; This tests that allocas are not split into slices that are not byte width multiple
+;.
+; CHECK: @foo_copy_source = external constant %union.Foo
+; CHECK: @i64_sink = global i64 0
+;.
define void @no_split_on_non_byte_width(i32) {
; CHECK-LABEL: @no_split_on_non_byte_width(
; CHECK-NEXT: [[ARG_SROA_0:%.*]] = alloca i8, align 8
@@ -92,12 +96,12 @@ declare i32 @memcpy_vec3float_helper(ptr)
; PR18726: Check that SROA does not rewrite a 12-byte memcpy into a 16-byte
; vector store, hence accidentally putting gibberish onto the stack.
-define i32 @memcpy_vec3float_widening(ptr %x) {
+define i32 @memcpy_vec3float_widening(ptr %x) !prof !0 {
; CHECK-LABEL: @memcpy_vec3float_widening(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1_SROA_0_0_COPYLOAD:%.*]] = load <3 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT: [[TMP1_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <3 x float> [[TMP1_SROA_0_0_COPYLOAD]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-; CHECK-NEXT: [[TMP1_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> [[TMP1_SROA_0_0_VEC_EXPAND]], <4 x float> undef
+; CHECK-NEXT: [[TMP1_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> [[TMP1_SROA_0_0_VEC_EXPAND]], <4 x float> undef, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = alloca [[S_VEC3FLOAT:%.*]], align 4
; CHECK-NEXT: [[TMP1_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <4 x float> [[TMP1_SROA_0_0_VECBLEND]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: store <3 x float> [[TMP1_SROA_0_0_VEC_EXTRACT]], ptr [[TMP2]], align 4
@@ -158,6 +162,15 @@ define i1 @presplit_overlarge_load() {
%L2 = load i1, ptr %A
ret i1 %L2
}
+!0 = !{!"function_entry_count", i32 10}
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK: [[PROF1]] = !{!"unknown", !"sroa"}
+;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
index f1ffcc7..239397b 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
@@ -17,7 +17,7 @@
define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_
-; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0
; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]]
diff --git a/llvm/test/Verifier/llvm.used-invalid-init.ll b/llvm/test/Verifier/llvm.used-invalid-init.ll
index 15a961c..38c84b15 100644
--- a/llvm/test/Verifier/llvm.used-invalid-init.ll
+++ b/llvm/test/Verifier/llvm.used-invalid-init.ll
@@ -2,5 +2,5 @@
@llvm.used = appending global [1 x ptr] zeroinitializer, section "llvm.metadata"
-; CHECK: wrong initalizer for intrinsic global variable
+; CHECK: wrong initializer for intrinsic global variable
; CHECK-NEXT: [1 x ptr] zeroinitializer
diff --git a/llvm/test/Verifier/matrix-intrinsics.ll b/llvm/test/Verifier/matrix-intrinsics.ll
index b6d5ad9..43d1a79 100644
--- a/llvm/test/Verifier/matrix-intrinsics.ll
+++ b/llvm/test/Verifier/matrix-intrinsics.ll
@@ -1,8 +1,7 @@
-; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not opt -S %s 2>&1 | FileCheck %s
define <4 x float> @transpose(<4 x float> %m, i32 %arg) {
-; CHECK: assembly parsed, but does not verify as correct!
-; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
+; CHECK: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector!
; CHECK-NEXT: immarg operand has non-immediate parameter
@@ -118,16 +117,34 @@ define void @column.major_store_stride_too_small(ptr %m, i64 %arg) {
ret void
}
+define <4 x float> @column.major_load_stride_i128(ptr %m, i32 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.load.v4f32.i128
+ %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i128(ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+ ret <4 x float> %result.1
+}
+
+define void @column.major_store_stride_i128(ptr %m, i64 %arg) {
+; CHECK-NEXT: Stride bitwidth cannot exceed 64!
+; CHECK-NEXT: ptr @llvm.matrix.column.major.store.v4f32.i128
+ call void @llvm.matrix.column.major.store.v4f32.i128(<4 x float> zeroinitializer, ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2)
+ ret void
+}
+
declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(ptr, i64, i1, i32, i32)
declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0(ptr, i64, i1, i32, i32)
declare <4 x float> @llvm.matrix.column.major.load.v4f32.i64(ptr, i64, i1, i32, i32)
declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr, i64, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i8(ptr, i8, i1, i32, i32)
+declare <6 x float> @llvm.matrix.column.major.load.v6f32.i128(ptr, i28, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f32.i64(<4 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v6f32.i64(<6 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4f32.p0(<4 x float>, ptr, i64, i1, i32, i32)
declare void @llvm.matrix.column.major.store.v4p0.i64(<4 x ptr>, ptr, i64, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i8(<4 x ptr>, ptr, i8, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v4p0.i128(<4 x ptr>, ptr, i128, i1, i32, i32)
declare <4 x i32> @llvm.matrix.transpose.v4i32.v4f32(<4 x float>, i32, i32)
declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32)
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
index 429bee4..a8c2531 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
@@ -65,8 +65,8 @@ define dso_local i32 @main() #0 {
attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-LABEL: check_boundaries:
-; CHECK: check_boundaries$local:
-; CHECK-NEXT: .type check_boundaries$local,@function
+; CHECK: .Lcheck_boundaries$local:
+; CHECK-NEXT: .type .Lcheck_boundaries$local,@function
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -107,8 +107,8 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-LABEL: main:
-; CHECK: main$local:
-; CHECK-NEXT: .type main$local,@function
+; CHECK: .Lmain$local:
+; CHECK-NEXT: .type .Lmain$local,@function
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
index 842fd88..34530f2 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected
@@ -6,8 +6,8 @@
define dso_local i32 @check_boundaries() #0 {
; CHECK-LABEL: check_boundaries:
-; CHECK: check_boundaries$local:
-; CHECK-NEXT: .type check_boundaries$local,@function
+; CHECK: .Lcheck_boundaries$local:
+; CHECK-NEXT: .type .Lcheck_boundaries$local,@function
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -84,8 +84,8 @@ define dso_local i32 @check_boundaries() #0 {
define dso_local i32 @main() #0 {
; CHECK-LABEL: main:
-; CHECK: main$local:
-; CHECK-NEXT: .type main$local,@function
+; CHECK: .Lmain$local:
+; CHECK-NEXT: .type .Lmain$local,@function
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
index 4ec1683..4ec1683 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vrgather-vcompress.s
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
index 5ebed10..5ebed10 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vgather-vcompress.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vrgather-vcompress.s
diff --git a/llvm/test/tools/llvm-objcopy/DXContainer/dump-section-errors.yaml b/llvm/test/tools/llvm-objcopy/DXContainer/dump-section-errors.yaml
new file mode 100644
index 0000000..e748eecf
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/DXContainer/dump-section-errors.yaml
@@ -0,0 +1,27 @@
+# RUN: yaml2obj %s -o %t.dxbc
+# RUN: not llvm-objcopy --dump-section=FKE0=%t.fek0 %t.dxbc 2>&1 | FileCheck %s --check-prefix=CHECK-ZEROSIZE -DFILE=%t.fek0
+# RUN: not llvm-objcopy --dump-section=FKE3=%t.fek1 %t.dxbc 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING -DFILE=%t.fek1
+# RUN: not llvm-objcopy --dump-section=FKE2=%t/does_not_exist/.fek2 %t.dxbc 2>&1 | FileCheck %s --check-prefix=CHECK-BAD-PATH -DFILE=%t/does_not_exist/.fek2 -DMSG=%errc_ENOENT
+
+# CHECK-ZEROSIZE: error: '[[FILE]]': part 'FKE0' is empty
+# CHECK-MISSING: error: '[[FILE]]': part 'FKE3' not found
+# CHECK-BAD-PATH: error: '[[FILE]]': [[MSG]]
+
+--- !dxcontainer
+Header:
+ Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+ Version:
+ Major: 1
+ Minor: 0
+ FileSize: 108
+ PartCount: 3
+ PartOffsets: [ 60, 68, 76 ]
+Parts:
+ - Name: FKE0
+ Size: 0
+ - Name: FKE1
+ Size: 0
+ - Name: FKE2
+ Size: 8
+...
diff --git a/llvm/test/tools/llvm-objcopy/DXContainer/dump-section.yaml b/llvm/test/tools/llvm-objcopy/DXContainer/dump-section.yaml
new file mode 100644
index 0000000..7d80a2c
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/DXContainer/dump-section.yaml
@@ -0,0 +1,278 @@
+# RUN: yaml2obj %s -o %t.dxbc
+# RUN: llvm-objcopy --dump-section=DXIL=%t.bc %t.dxbc
+# RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefix=BITCODE
+# RUN: wc -c %t.bc | FileCheck %s --check-prefix=DXIL-SIZE
+
+## Verify that when dumping the DXIL part we get a valid bitcode file.
+# BITCODE: define void @main()
+## Verify the size of the bitcode data.
+# DXIL-SIZE: 1708
+
+## Dump the PSV0 part and verify its size.
+# RUN: llvm-objcopy --dump-section=PSV0=%t.psv0 %t.dxbc
+# RUN: wc -c %t.psv0 | FileCheck %s --check-prefix=PSV0-SIZE
+# RUN: od -v -Ax -t x1 %t.psv0 | FileCheck %s --check-prefix=PSV0-CONTENTS
+# PSV0-SIZE: 76
+
+# For a compute shader the structure size (0x34) is encoded first, followed by
+# 00'd bytes until you get to the wave lane count min (0) and max (0xffffffff),
+# followed by the shader stage (5 for compute).
+# TODO: Update this test to use objdump or obj2yaml once we support
+# --add-section in objcopy. See issue:
+# https://github.com/llvm/llvm-project/issues/162159.
+# PSV0-CONTENTS: 0000 34 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+# PSV0-CONTENTS: 0010 00 00 00 00 00 00 00 00 ff ff ff ff 05 00 00 00
+
+--- !dxcontainer
+Header:
+ Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+ Version:
+ Major: 1
+ Minor: 0
+ FileSize: 1872
+ PartCount: 2
+ PartOffsets: [ 40, 1780 ]
+Parts:
+ - Name: DXIL
+ Size: 1732
+ Program:
+ MajorVersion: 6
+ MinorVersion: 0
+ ShaderKind: 5
+ Size: 433
+ DXILMajorVersion: 1
+ DXILMinorVersion: 0
+ DXILSize: 1708
+ DXIL: [ 0x42, 0x43, 0xC0, 0xDE, 0x21, 0xC, 0x0, 0x0, 0xA8,
+ 0x1, 0x0, 0x0, 0xB, 0x82, 0x20, 0x0, 0x2, 0x0,
+ 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x7, 0x81, 0x23,
+ 0x91, 0x41, 0xC8, 0x4, 0x49, 0x6, 0x10, 0x32,
+ 0x39, 0x92, 0x1, 0x84, 0xC, 0x25, 0x5, 0x8, 0x19,
+ 0x1E, 0x4, 0x8B, 0x62, 0x80, 0x10, 0x45, 0x2,
+ 0x42, 0x92, 0xB, 0x42, 0x84, 0x10, 0x32, 0x14,
+ 0x38, 0x8, 0x18, 0x4B, 0xA, 0x32, 0x42, 0x88,
+ 0x48, 0x90, 0x14, 0x20, 0x43, 0x46, 0x88, 0xA5,
+ 0x0, 0x19, 0x32, 0x42, 0xE4, 0x48, 0xE, 0x90,
+ 0x11, 0x22, 0xC4, 0x50, 0x41, 0x51, 0x81, 0x8C,
+ 0xE1, 0x83, 0xE5, 0x8A, 0x4, 0x21, 0x46, 0x6,
+ 0x51, 0x18, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x1B,
+ 0x90, 0xE0, 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xC0,
+ 0x1, 0x24, 0x80, 0x2, 0x0, 0x0, 0x0, 0x49, 0x18,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x13, 0x82, 0x0,
+ 0x0, 0x89, 0x20, 0x0, 0x0, 0x11, 0x0, 0x0, 0x0,
+ 0x32, 0x22, 0x8, 0x9, 0x20, 0x64, 0x85, 0x4, 0x13,
+ 0x22, 0xA4, 0x84, 0x4, 0x13, 0x22, 0xE3, 0x84,
+ 0xA1, 0x90, 0x14, 0x12, 0x4C, 0x88, 0x8C, 0xB,
+ 0x84, 0x84, 0x4C, 0x10, 0x20, 0x73, 0x4, 0x8,
+ 0xC1, 0x65, 0xC3, 0x85, 0x2C, 0xE8, 0x3, 0x40,
+ 0x14, 0x91, 0x4E, 0xD1, 0x4A, 0x48, 0x44, 0x54,
+ 0x11, 0xC3, 0x9, 0x30, 0xC4, 0x18, 0x1, 0x30,
+ 0x2, 0x50, 0x82, 0x21, 0x1A, 0x8, 0x98, 0x23,
+ 0x0, 0x3, 0x0, 0x13, 0x14, 0x72, 0xC0, 0x87, 0x74,
+ 0x60, 0x87, 0x36, 0x68, 0x87, 0x79, 0x68, 0x3,
+ 0x72, 0xC0, 0x87, 0xD, 0xAE, 0x50, 0xE, 0x6D,
+ 0xD0, 0xE, 0x7A, 0x50, 0xE, 0x6D, 0x0, 0xF, 0x7A,
+ 0x30, 0x7, 0x72, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D,
+ 0x90, 0xE, 0x71, 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D,
+ 0x90, 0xE, 0x78, 0xA0, 0x7, 0x78, 0xD0, 0x6, 0xE9,
+ 0x10, 0x7, 0x76, 0xA0, 0x7, 0x71, 0x60, 0x7, 0x6D,
+ 0x90, 0xE, 0x73, 0x20, 0x7, 0x7A, 0x30, 0x7, 0x72,
+ 0xD0, 0x6, 0xE9, 0x60, 0x7, 0x74, 0xA0, 0x7, 0x76,
+ 0x40, 0x7, 0x6D, 0x60, 0xE, 0x71, 0x60, 0x7, 0x7A,
+ 0x10, 0x7, 0x76, 0xD0, 0x6, 0xE6, 0x30, 0x7, 0x72,
+ 0xA0, 0x7, 0x73, 0x20, 0x7, 0x6D, 0x60, 0xE, 0x76,
+ 0x40, 0x7, 0x7A, 0x60, 0x7, 0x74, 0xD0, 0x6, 0xEE,
+ 0x80, 0x7, 0x7A, 0x10, 0x7, 0x76, 0xA0, 0x7, 0x73,
+ 0x20, 0x7, 0x7A, 0x60, 0x7, 0x74, 0x30, 0xE4,
+ 0x21, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x2, 0x0,
+ 0x0, 0x0, 0x20, 0xB, 0x4, 0x7, 0x0, 0x0, 0x0,
+ 0x32, 0x1E, 0x98, 0xC, 0x19, 0x11, 0x4C, 0x90,
+ 0x8C, 0x9, 0x26, 0x47, 0xC6, 0x4, 0x43, 0xBA,
+ 0x12, 0x28, 0x88, 0x62, 0x18, 0x1, 0x28, 0x84,
+ 0x22, 0x0, 0x0, 0x0, 0x79, 0x18, 0x0, 0x0, 0xE5,
+ 0x0, 0x0, 0x0, 0x33, 0x8, 0x80, 0x1C, 0xC4, 0xE1,
+ 0x1C, 0x66, 0x14, 0x1, 0x3D, 0x88, 0x43, 0x38,
+ 0x84, 0xC3, 0x8C, 0x42, 0x80, 0x7, 0x79, 0x78,
+ 0x7, 0x73, 0x98, 0x71, 0xC, 0xE6, 0x0, 0xF, 0xED,
+ 0x10, 0xE, 0xF4, 0x80, 0xE, 0x33, 0xC, 0x42, 0x1E,
+ 0xC2, 0xC1, 0x1D, 0xCE, 0xA1, 0x1C, 0x66, 0x30,
+ 0x5, 0x3D, 0x88, 0x43, 0x38, 0x84, 0x83, 0x1B,
+ 0xCC, 0x3, 0x3D, 0xC8, 0x43, 0x3D, 0x8C, 0x3,
+ 0x3D, 0xCC, 0x78, 0x8C, 0x74, 0x70, 0x7, 0x7B,
+ 0x8, 0x7, 0x79, 0x48, 0x87, 0x70, 0x70, 0x7, 0x7A,
+ 0x70, 0x3, 0x76, 0x78, 0x87, 0x70, 0x20, 0x87,
+ 0x19, 0xCC, 0x11, 0xE, 0xEC, 0x90, 0xE, 0xE1,
+ 0x30, 0xF, 0x6E, 0x30, 0xF, 0xE3, 0xF0, 0xE, 0xF0,
+ 0x50, 0xE, 0x33, 0x10, 0xC4, 0x1D, 0xDE, 0x21,
+ 0x1C, 0xD8, 0x21, 0x1D, 0xC2, 0x61, 0x1E, 0x66,
+ 0x30, 0x89, 0x3B, 0xBC, 0x83, 0x3B, 0xD0, 0x43,
+ 0x39, 0xB4, 0x3, 0x3C, 0xBC, 0x83, 0x3C, 0x84,
+ 0x3, 0x3B, 0xCC, 0xF0, 0x14, 0x76, 0x60, 0x7,
+ 0x7B, 0x68, 0x7, 0x37, 0x68, 0x87, 0x72, 0x68,
+ 0x7, 0x37, 0x80, 0x87, 0x70, 0x90, 0x87, 0x70,
+ 0x60, 0x7, 0x76, 0x28, 0x7, 0x76, 0xF8, 0x5, 0x76,
+ 0x78, 0x87, 0x77, 0x80, 0x87, 0x5F, 0x8, 0x87,
+ 0x71, 0x18, 0x87, 0x72, 0x98, 0x87, 0x79, 0x98,
+ 0x81, 0x2C, 0xEE, 0xF0, 0xE, 0xEE, 0xE0, 0xE,
+ 0xF5, 0xC0, 0xE, 0xEC, 0x30, 0x3, 0x62, 0xC8,
+ 0xA1, 0x1C, 0xE4, 0xA1, 0x1C, 0xCC, 0xA1, 0x1C,
+ 0xE4, 0xA1, 0x1C, 0xDC, 0x61, 0x1C, 0xCA, 0x21,
+ 0x1C, 0xC4, 0x81, 0x1D, 0xCA, 0x61, 0x6, 0xD6,
+ 0x90, 0x43, 0x39, 0xC8, 0x43, 0x39, 0x98, 0x43,
+ 0x39, 0xC8, 0x43, 0x39, 0xB8, 0xC3, 0x38, 0x94,
+ 0x43, 0x38, 0x88, 0x3, 0x3B, 0x94, 0xC3, 0x2F,
+ 0xBC, 0x83, 0x3C, 0xFC, 0x82, 0x3B, 0xD4, 0x3,
+ 0x3B, 0xB0, 0xC3, 0xC, 0xC7, 0x69, 0x87, 0x70,
+ 0x58, 0x87, 0x72, 0x70, 0x83, 0x74, 0x68, 0x7,
+ 0x78, 0x60, 0x87, 0x74, 0x18, 0x87, 0x74, 0xA0,
+ 0x87, 0x19, 0xCE, 0x53, 0xF, 0xEE, 0x0, 0xF, 0xF2,
+ 0x50, 0xE, 0xE4, 0x90, 0xE, 0xE3, 0x40, 0xF, 0xE1,
+ 0x20, 0xE, 0xEC, 0x50, 0xE, 0x33, 0x20, 0x28,
+ 0x1D, 0xDC, 0xC1, 0x1E, 0xC2, 0x41, 0x1E, 0xD2,
+ 0x21, 0x1C, 0xDC, 0x81, 0x1E, 0xDC, 0xE0, 0x1C,
+ 0xE4, 0xE1, 0x1D, 0xEA, 0x1, 0x1E, 0x66, 0x18,
+ 0x51, 0x38, 0xB0, 0x43, 0x3A, 0x9C, 0x83, 0x3B,
+ 0xCC, 0x50, 0x24, 0x76, 0x60, 0x7, 0x7B, 0x68,
+ 0x7, 0x37, 0x60, 0x87, 0x77, 0x78, 0x7, 0x78,
+ 0x98, 0x51, 0x4C, 0xF4, 0x90, 0xF, 0xF0, 0x50,
+ 0xE, 0x33, 0x1E, 0x6A, 0x1E, 0xCA, 0x61, 0x1C,
+ 0xE8, 0x21, 0x1D, 0xDE, 0xC1, 0x1D, 0x7E, 0x1,
+ 0x1E, 0xE4, 0xA1, 0x1C, 0xCC, 0x21, 0x1D, 0xF0,
+ 0x61, 0x6, 0x54, 0x85, 0x83, 0x38, 0xCC, 0xC3,
+ 0x3B, 0xB0, 0x43, 0x3D, 0xD0, 0x43, 0x39, 0xFC,
+ 0xC2, 0x3C, 0xE4, 0x43, 0x3B, 0x88, 0xC3, 0x3B,
+ 0xB0, 0xC3, 0x8C, 0xC5, 0xA, 0x87, 0x79, 0x98,
+ 0x87, 0x77, 0x18, 0x87, 0x74, 0x8, 0x7, 0x7A,
+ 0x28, 0x7, 0x72, 0x98, 0x81, 0x5C, 0xE3, 0x10,
+ 0xE, 0xEC, 0xC0, 0xE, 0xE5, 0x50, 0xE, 0xF3, 0x30,
+ 0x23, 0xC1, 0xD2, 0x41, 0x1E, 0xE4, 0xE1, 0x17,
+ 0xD8, 0xE1, 0x1D, 0xDE, 0x1, 0x1E, 0x66, 0x48,
+ 0x19, 0x3B, 0xB0, 0x83, 0x3D, 0xB4, 0x83, 0x1B,
+ 0x84, 0xC3, 0x38, 0x8C, 0x43, 0x39, 0xCC, 0xC3,
+ 0x3C, 0xB8, 0xC1, 0x39, 0xC8, 0xC3, 0x3B, 0xD4,
+ 0x3, 0x3C, 0xCC, 0x48, 0xB4, 0x71, 0x8, 0x7, 0x76,
+ 0x60, 0x7, 0x71, 0x8, 0x87, 0x71, 0x58, 0x87,
+ 0x19, 0xDB, 0xC6, 0xE, 0xEC, 0x60, 0xF, 0xED,
+ 0xE0, 0x6, 0xF0, 0x20, 0xF, 0xE5, 0x30, 0xF, 0xE5,
+ 0x20, 0xF, 0xF6, 0x50, 0xE, 0x6E, 0x10, 0xE, 0xE3,
+ 0x30, 0xE, 0xE5, 0x30, 0xF, 0xF3, 0xE0, 0x6, 0xE9,
+ 0xE0, 0xE, 0xE4, 0x50, 0xE, 0xF8, 0x30, 0x23,
+ 0xE2, 0xEC, 0x61, 0x1C, 0xC2, 0x81, 0x1D, 0xD8,
+ 0xE1, 0x17, 0xEC, 0x21, 0x1D, 0xE6, 0x21, 0x1D,
+ 0xC4, 0x21, 0x1D, 0xD8, 0x21, 0x1D, 0xE8, 0x21,
+ 0x1F, 0x66, 0x20, 0x9D, 0x3B, 0xBC, 0x43, 0x3D,
+ 0xB8, 0x3, 0x39, 0x94, 0x83, 0x39, 0xCC, 0x58,
+ 0xBC, 0x70, 0x70, 0x7, 0x77, 0x78, 0x7, 0x7A,
+ 0x8, 0x7, 0x7A, 0x48, 0x87, 0x77, 0x70, 0x87,
+ 0x19, 0xCB, 0xE7, 0xE, 0xEF, 0x30, 0xF, 0xE1,
+ 0xE0, 0xE, 0xE9, 0x40, 0xF, 0xE9, 0xA0, 0xF, 0xE5,
+ 0x30, 0xC3, 0x1, 0x3, 0x73, 0xA8, 0x7, 0x77, 0x18,
+ 0x87, 0x5F, 0x98, 0x87, 0x70, 0x70, 0x87, 0x74,
+ 0xA0, 0x87, 0x74, 0xD0, 0x87, 0x72, 0x98, 0x81,
+ 0x84, 0x41, 0x39, 0xE0, 0xC3, 0x38, 0xB0, 0x43,
+ 0x3D, 0x90, 0x43, 0x39, 0xCC, 0x40, 0xC4, 0xA0,
+ 0x1D, 0xCA, 0xA1, 0x1D, 0xE0, 0x41, 0x1E, 0xDE,
+ 0xC1, 0x1C, 0x66, 0x24, 0x63, 0x30, 0xE, 0xE1,
+ 0xC0, 0xE, 0xEC, 0x30, 0xF, 0xE9, 0x40, 0xF, 0xE5,
+ 0x30, 0x43, 0x21, 0x83, 0x75, 0x18, 0x7, 0x73,
+ 0x48, 0x87, 0x5F, 0xA0, 0x87, 0x7C, 0x80, 0x87,
+ 0x72, 0x98, 0xB1, 0x94, 0x1, 0x3C, 0x8C, 0xC3,
+ 0x3C, 0x94, 0xC3, 0x38, 0xD0, 0x43, 0x3A, 0xBC,
+ 0x83, 0x3B, 0xCC, 0xC3, 0x8C, 0xC5, 0xC, 0x48,
+ 0x21, 0x15, 0x42, 0x61, 0x1E, 0xE6, 0x21, 0x1D,
+ 0xCE, 0xC1, 0x1D, 0x52, 0x81, 0x14, 0x66, 0x4C,
+ 0x67, 0x30, 0xE, 0xEF, 0x20, 0xF, 0xEF, 0xE0,
+ 0x6, 0xEF, 0x50, 0xF, 0xF4, 0x30, 0xF, 0xE9, 0x40,
+ 0xE, 0xE5, 0xE0, 0x6, 0xE6, 0x20, 0xF, 0xE1, 0xD0,
+ 0xE, 0xE5, 0x30, 0xA3, 0x40, 0x83, 0x76, 0x68,
+ 0x7, 0x79, 0x8, 0x87, 0x19, 0x52, 0x1A, 0xB8,
+ 0xC3, 0x3B, 0x84, 0x3, 0x3B, 0xA4, 0x43, 0x38,
+ 0xCC, 0x83, 0x1B, 0x84, 0x3, 0x39, 0x90, 0x83,
+ 0x3C, 0xCC, 0x3, 0x3C, 0x84, 0xC3, 0x38, 0x94,
+ 0xC3, 0xC, 0x46, 0xD, 0xC6, 0x21, 0x1C, 0xD8,
+ 0x81, 0x1D, 0xCA, 0xA1, 0x1C, 0x7E, 0x81, 0x1E,
+ 0xF2, 0x1, 0x1E, 0xCA, 0x61, 0xC6, 0xB1, 0x6,
+ 0xEE, 0xF0, 0xE, 0xE6, 0x20, 0xF, 0xE5, 0x50,
+ 0xE, 0x33, 0x1C, 0x36, 0x20, 0x7, 0x7C, 0x70,
+ 0x3, 0x77, 0x78, 0x7, 0x77, 0xA8, 0x7, 0x77, 0x48,
+ 0x7, 0x73, 0x78, 0x7, 0x79, 0x68, 0x87, 0x19,
+ 0x55, 0x1B, 0x90, 0x3, 0x3E, 0xB8, 0xC1, 0x38,
+ 0xBC, 0x83, 0x3B, 0xD0, 0x83, 0x3C, 0xBC, 0x3,
+ 0x3B, 0x98, 0x3, 0x3B, 0xBC, 0xC3, 0x3D, 0xB8,
+ 0x1, 0x3A, 0xA4, 0x83, 0x3B, 0xD0, 0xC3, 0x3C,
+ 0xCC, 0x58, 0xDC, 0x80, 0x1C, 0xF0, 0xC1, 0xD,
+ 0xE0, 0x41, 0x1E, 0xCA, 0x61, 0x1C, 0xD2, 0x61,
+ 0x1E, 0xCA, 0x1, 0x0, 0x79, 0x28, 0x0, 0x0, 0x52,
+ 0x0, 0x0, 0x0, 0xC2, 0x3C, 0x90, 0x40, 0x86, 0x10,
+ 0x19, 0x32, 0xE2, 0x64, 0x90, 0x40, 0x46, 0x2,
+ 0x19, 0x23, 0x23, 0x46, 0x2, 0x13, 0x24, 0xC6,
+ 0x0, 0x13, 0x74, 0xCE, 0x61, 0x8C, 0x2D, 0xCC,
+ 0xED, 0xC, 0xC4, 0xAE, 0x4C, 0x6E, 0x2E, 0xED,
+ 0xCD, 0xD, 0x44, 0x46, 0xC6, 0x5, 0xC6, 0x5, 0xE6,
+ 0x2C, 0x8D, 0xE, 0x4, 0xE5, 0x2C, 0x8D, 0xE, 0xE8,
+ 0x2C, 0x8D, 0xE, 0xAD, 0x4E, 0xCC, 0x65, 0xEC,
+ 0xAD, 0x4D, 0x87, 0x8D, 0xCD, 0xAE, 0xED, 0x85,
+ 0x8D, 0xCD, 0xAE, 0xAD, 0x5, 0x4E, 0xEE, 0x4D,
+ 0xAD, 0x6C, 0x8C, 0xCE, 0xE5, 0x2C, 0x8D, 0xE,
+ 0xA4, 0xEC, 0xC6, 0x86, 0xA6, 0x2C, 0x26, 0x7,
+ 0xA6, 0xAC, 0xC, 0x26, 0x26, 0xE7, 0x46, 0x6C,
+ 0x2C, 0xA6, 0xC, 0x66, 0xA6, 0x6C, 0xC6, 0x4C,
+ 0x6, 0x86, 0x6C, 0x4C, 0x6, 0x46, 0xCC, 0x66,
+ 0x6C, 0x2C, 0xC, 0x27, 0x46, 0x6C, 0x86, 0x6C,
+ 0x2C, 0xE5, 0x8, 0x63, 0x73, 0x87, 0x68, 0xB,
+ 0x4B, 0x73, 0x3B, 0xCA, 0xDD, 0x18, 0x5A, 0x98,
+ 0xDC, 0xD7, 0x5C, 0x9A, 0x5E, 0xD9, 0x69, 0xCC,
+ 0xE4, 0xC2, 0xDA, 0xCA, 0x5A, 0xE0, 0xDE, 0xD2,
+ 0xDC, 0xE8, 0xCA, 0xE4, 0x86, 0x20, 0x1C, 0xC1,
+ 0x10, 0x84, 0x43, 0x18, 0x82, 0x70, 0xC, 0x43,
+ 0x10, 0xE, 0x62, 0x8, 0x42, 0x1, 0xC, 0x41, 0x38,
+ 0x8A, 0x21, 0x8, 0x87, 0x31, 0x6, 0xC1, 0x38,
+ 0xC6, 0x10, 0x4, 0x63, 0x18, 0x4, 0x24, 0x19,
+ 0x83, 0x60, 0x24, 0x63, 0x18, 0xC, 0xC3, 0x18,
+ 0x83, 0xB0, 0x44, 0x63, 0x28, 0x94, 0x1, 0x0,
+ 0xA4, 0x31, 0xC, 0x6, 0xB1, 0x8C, 0x61, 0x60,
+ 0xA, 0xC6, 0x24, 0x64, 0x78, 0x2E, 0x76, 0x61,
+ 0x6C, 0x76, 0x65, 0x72, 0x43, 0x9, 0x18, 0xA3,
+ 0xB0, 0xB1, 0xD9, 0xB5, 0xB9, 0xA4, 0x91, 0x95,
+ 0xB9, 0xD1, 0xD, 0x25, 0x68, 0x8C, 0x43, 0x86,
+ 0xE7, 0x32, 0x87, 0x16, 0x46, 0x56, 0x26, 0xD7,
+ 0xF4, 0x46, 0x56, 0xC6, 0x36, 0x94, 0xC0, 0x31,
+ 0xA, 0x19, 0x9E, 0x8B, 0x5D, 0x99, 0xDC, 0x5C,
+ 0xDA, 0x9B, 0xDB, 0x50, 0x82, 0xC7, 0x38, 0x64,
+ 0x78, 0x2E, 0x65, 0x6E, 0x74, 0x72, 0x79, 0x50,
+ 0x6F, 0x69, 0x6E, 0x74, 0x73, 0x43, 0x9, 0x24,
+ 0x13, 0xB1, 0xB1, 0xD9, 0xB5, 0xB9, 0xB4, 0xBD,
+ 0x91, 0xD5, 0xB1, 0x95, 0xB9, 0x98, 0xB1, 0x85,
+ 0x9D, 0xCD, 0xD, 0x45, 0x98, 0x28, 0x0, 0x71,
+ 0x20, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x6, 0x40,
+ 0x30, 0x0, 0xD2, 0x0, 0x0, 0x0, 0x61, 0x20, 0x0,
+ 0x0, 0x6, 0x0, 0x0, 0x0, 0x13, 0x4, 0x1, 0x86,
+ 0x3, 0x1, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x7, 0x50,
+ 0x10, 0xCD, 0x14, 0x61, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0 ]
+ - Name: PSV0
+ Size: 76
+ PSVInfo:
+ Version: 3
+ ShaderStage: 5
+ MinimumWaveLaneCount: 0
+ MaximumWaveLaneCount: 4294967295
+ UsesViewID: 0
+ SigInputVectors: 0
+ SigOutputVectors: [ 0, 0, 0, 0 ]
+ NumThreadsX: 1
+ NumThreadsY: 1
+ NumThreadsZ: 1
+ EntryName: main
+ ResourceStride: 24
+ Resources: []
+ SigInputElements: []
+ SigOutputElements: []
+ SigPatchOrPrimElements: []
+ InputOutputMap:
+ - [ ]
+ - [ ]
+ - [ ]
+ - [ ]
+...
diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s
new file mode 100644
index 0000000..02a52bb
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/packet-reset-on-label.s
@@ -0,0 +1,23 @@
+// RUN: llvm-mc -triple=hexagon -mcpu=hexagonv75 -filetype=obj %s \
+// RUN: | llvm-objdump -d - \
+// RUN: | FileCheck %s
+
+foo:
+ { nop }
+  /// A nop without end-of-packet bits set, to simulate data that is
+  /// not a proper packet end.
+ .long 0x7f004000
+bar:
+ { nop
+ nop
+ }
+
+// CHECK-LABEL: <foo>:
+// CHECK: { nop }
+// CHECK-NEXT: { nop
+
+/// The instruction starting after <bar> should start in a new packet.
+// CHECK-LABEL: <bar>:
+// CHECK: { nop
+// CHECK-NEXT: nop }
+
diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
index fd1492f..bcffd40 100644
--- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
+++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test
@@ -34,6 +34,7 @@
# CHECK-NEXT: {
# CHECK-NEXT: ID: 0
# CHECK-NEXT: Offset: 0x0
+# CHECK-NEXT: Hash: 0x0
# CHECK-NEXT: Size: 0x1
# CHECK-NEXT: HasReturn: No
# CHECK-NEXT: HasTailCall: Yes
@@ -50,6 +51,7 @@
# CHECK-NEXT: ID: 2
# CHECK-NEXT: Offset: 0x3
# CHECK-NEXT: Callsite End Offsets: [1, 3]
+# CHECK-NEXT: Hash: 0x123
# CHECK-NEXT: Size: 0x7
# CHECK-NEXT: HasReturn: Yes
# CHECK-NEXT: HasTailCall: No
@@ -144,8 +146,8 @@ Sections:
ShSize: [[SIZE=<none>]]
Link: .text
Entries:
- - Version: 3
- Feature: 0x28
+ - Version: 4
+ Feature: 0x68
BBRanges:
- BaseAddress: [[ADDR=0x11111]]
BBEntries:
@@ -160,6 +162,7 @@ Sections:
Size: 0x4
Metadata: 0x15
CallsiteEndOffsets: [ 0x1 , 0x2 ]
+ Hash: 0x123
- Version: 2
BBRanges:
- BaseAddress: 0x22222
diff --git a/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml b/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
index dc14025..7a22efe 100644
--- a/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/bb-addr-map.yaml
@@ -162,6 +162,92 @@ Sections:
BBRanges:
- BaseAddress: 0x20
+## Check that obj2yaml can dump basic block hash in the .llvm_bb_addr_map section.
+
+# RUN: yaml2obj --docnum=4 %s -o %t4
+# RUN: obj2yaml %t4 | FileCheck %s --check-prefix=BBHASH
+
+# BBHASH: --- !ELF
+# BBHASH-NEXT: FileHeader:
+# BBHASH-NEXT: Class: ELFCLASS64
+# BBHASH-NEXT: Data: ELFDATA2LSB
+# BBHASH-NEXT: Type: ET_EXEC
+# BBHASH-NEXT: Sections:
+# BBHASH-NEXT: - Name: .llvm_bb_addr_map
+# BBHASH-NEXT: Type: SHT_LLVM_BB_ADDR_MAP
+# BBHASH-NEXT: Entries:
+# BBHASH-NEXT: - Version: 4
+# BBHASH-NEXT: Feature: 0x40
+# BBHASH-NEXT: BBRanges:
+# BBHASH-NEXT: - BBEntries:
+# BBHASH-NEXT: - ID: 0
+# BBHASH-NEXT: AddressOffset: 0x1
+# BBHASH-NEXT: Size: 0x2
+# BBHASH-NEXT: Metadata: 0x3
+# BBHASH-NEXT: Hash: 0x1
+# BBHASH-NEXT: - ID: 2
+# BBHASH-NEXT: AddressOffset: 0x4
+# BBHASH-NEXT: Size: 0x5
+# BBHASH-NEXT: Metadata: 0x6
+# BBHASH-NEXT: Hash: 0x2
+# BBHASH-NEXT: - ID: 4
+# BBHASH-NEXT: AddressOffset: 0xFFFFFFFFFFFFFFF7
+# BBHASH-NEXT: Size: 0xFFFFFFFFFFFFFFF8
+# BBHASH-NEXT: Metadata: 0xFFFFFFFFFFFFFFF9
+# BBHASH-NEXT: Hash: 0x3
+# BBHASH-NEXT: - Version: 4
+# BBHASH-NEXT: Feature: 0x68
+# BBHASH-NEXT: BBRanges:
+# BBHASH-NEXT: - BaseAddress: 0xFFFFFFFFFFFFFF20
+# BBHASH-NEXT: BBEntries:
+# BBHASH-NEXT: - ID: 6
+# BBHASH-NEXT: AddressOffset: 0xA
+# BBHASH-NEXT: Size: 0xB
+# BBHASH-NEXT: Metadata: 0xC
+# BBHASH-NEXT: CallsiteEndOffsets: [ 0x1, 0x2 ]
+# BBHASH-NEXT: Hash: 0x123
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+Sections:
+ - Name: .llvm_bb_addr_map
+ Type: SHT_LLVM_BB_ADDR_MAP
+ Entries:
+ - Version: 4
+ Feature: 0x40
+ BBRanges:
+ - BaseAddress: 0x0
+ BBEntries:
+ - ID: 0
+ AddressOffset: 0x1
+ Size: 0x2
+ Metadata: 0x3
+ Hash: 0x1
+ - ID: 2
+ AddressOffset: 0x4
+ Size: 0x5
+ Metadata: 0x6
+ Hash: 0x2
+ - ID: 4
+ AddressOffset: 0xFFFFFFFFFFFFFFF7
+ Size: 0xFFFFFFFFFFFFFFF8
+ Metadata: 0xFFFFFFFFFFFFFFF9
+ Hash: 0x3
+ - Version: 4
+ Feature: 0x68
+ BBRanges:
+ - BaseAddress: 0xFFFFFFFFFFFFFF20
+ BBEntries:
+ - ID: 6
+ AddressOffset: 0xA
+ Size: 0xB
+ Metadata: 0xC
+ CallsiteEndOffsets: [ 0x1, 0x2 ]
+ Hash: 0x123
+
## Check that obj2yaml uses the "Content" tag to describe an .llvm_bb_addr_map section
## when it can't extract the entries, for example, when the section is truncated, or
## when an invalid 'NumBlocks' or 'NumBBRanges` field is specified.
diff --git a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
index 418f90f..339e419 100644
--- a/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
+++ b/llvm/test/tools/yaml2obj/ELF/bb-addr-map.yaml
@@ -72,6 +72,13 @@
# CHECK-NEXT: 0000: 03202000 00000000 0000010E 01000203
# CHECK-NEXT: )
+# Case 10: Specify basic block hash.
+# CHECK: Name: .llvm_bb_addr_map (1)
+# CHECK: SectionData (
+# CHECK-NEXT: 0000: 04602000 00000000 0000010E 01000203
+# CHECK-NEXT: 0010: 23010000 00000000
+# CHECK-NEXT: )
+
--- !ELF
FileHeader:
@@ -176,6 +183,22 @@ Sections:
Metadata: 0x00000003
CallsiteEndOffsets: []
+## 10) We can produce a SHT_LLVM_BB_ADDR_MAP section with basic block hash.
+ - Name: '.llvm_bb_addr_map (10)'
+ Type: SHT_LLVM_BB_ADDR_MAP
+ Entries:
+ - Version: 4
+ Feature: 0x60
+ BBRanges:
+ - BaseAddress: 0x0000000000000020
+ BBEntries:
+ - ID: 14
+ AddressOffset: 0x00000001
+ Size: 0x00000002
+ Metadata: 0x00000003
+ CallsiteEndOffsets: []
+ Hash: 0x123
+
## Check we can't use Entries at the same time as either Content or Size.
# RUN: not yaml2obj --docnum=2 -DCONTENT="00" %s 2>&1 | FileCheck %s --check-prefix=INVALID
# RUN: not yaml2obj --docnum=2 -DSIZE="0" %s 2>&1 | FileCheck %s --check-prefix=INVALID
@@ -197,7 +220,7 @@ Sections:
## Check that yaml2obj generates a warning when we use unsupported versions.
# RUN: yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=INVALID-VERSION
-# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 4; encoding using the most recent version
+# INVALID-VERSION: warning: unsupported SHT_LLVM_BB_ADDR_MAP version: 5; encoding using the most recent version
--- !ELF
FileHeader:
@@ -209,4 +232,4 @@ Sections:
Type: SHT_LLVM_BB_ADDR_MAP
Entries:
## Specify unsupported version
- - Version: 4
+ - Version: 5
diff --git a/llvm/tools/bugpoint/BugDriver.cpp b/llvm/tools/bugpoint/BugDriver.cpp
index 2bdfebe..a7e93f6 100644
--- a/llvm/tools/bugpoint/BugDriver.cpp
+++ b/llvm/tools/bugpoint/BugDriver.cpp
@@ -27,9 +27,7 @@
#include <memory>
using namespace llvm;
-namespace llvm {
-Triple TargetTriple;
-}
+Triple llvm::TargetTriple;
DiscardTemp::~DiscardTemp() {
if (SaveTemps) {
@@ -41,18 +39,14 @@ DiscardTemp::~DiscardTemp() {
errs() << "Failed to delete temp file " << toString(std::move(E)) << '\n';
}
-// Anonymous namespace to define command line options for debugging.
-//
-namespace {
// Output - The user can specify a file containing the expected output of the
// program. If this filename is set, it is used as the reference diff source,
// otherwise the raw input run through an interpreter is used as the reference
// source.
//
-cl::opt<std::string> OutputFile("output",
- cl::desc("Specify a reference program output "
- "(for miscompilation detection)"));
-}
+static cl::opt<std::string>
+ OutputFile("output", cl::desc("Specify a reference program output "
+ "(for miscompilation detection)"));
/// If we reduce or update the program somehow, call this method to update
/// bugdriver with it. This deletes the old module and sets the specified one
@@ -238,7 +232,7 @@ Error BugDriver::run() {
return Error::success();
}
-void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
+void llvm::printFunctionList(const std::vector<Function *> &Funcs) {
unsigned NumPrint = Funcs.size();
if (NumPrint > 10)
NumPrint = 10;
@@ -249,7 +243,7 @@ void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
outs().flush();
}
-void llvm::PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
+void llvm::printGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
unsigned NumPrint = GVs.size();
if (NumPrint > 10)
NumPrint = 10;
diff --git a/llvm/tools/bugpoint/BugDriver.h b/llvm/tools/bugpoint/BugDriver.h
index e3117ec..ca57405 100644
--- a/llvm/tools/bugpoint/BugDriver.h
+++ b/llvm/tools/bugpoint/BugDriver.h
@@ -57,7 +57,6 @@ class BugDriver {
// FIXME: sort out public/private distinctions...
friend class ReducePassList;
- friend class ReduceMisCodegenFunctions;
public:
BugDriver(const char *toolname, bool find_bugs, unsigned timeout,
@@ -76,7 +75,7 @@ public:
void setPassesToRun(const std::vector<std::string> &PTR) {
PassesToRun = PTR;
}
- const std::vector<std::string> &getPassesToRun() const { return PassesToRun; }
+ ArrayRef<std::string> getPassesToRun() const { return PassesToRun; }
/// run - The top level method that is invoked after all of the instance
/// variables are set up from command line arguments. The \p as_child argument
@@ -111,7 +110,6 @@ public:
Error debugCodeGenerator();
/// isExecutingJIT - Returns true if bugpoint is currently testing the JIT
- ///
bool isExecutingJIT();
Module &getProgram() const { return *Program; }
@@ -167,7 +165,7 @@ public:
bool RemoveBitcode = false) const;
/// This function is used to output M to a file named "bugpoint-ID.bc".
- void EmitProgressBitcode(const Module &M, const std::string &ID,
+ void emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer = false) const;
/// This method clones the current Program and deletes the specified
@@ -214,7 +212,6 @@ public:
/// outs() a single line message indicating whether compilation was successful
/// or failed, unless Quiet is set. ExtraArgs specifies additional arguments
/// to pass to the child bugpoint instance.
- ///
bool runPasses(Module &Program, const std::vector<std::string> &PassesToRun,
std::string &OutputFilename, bool DeleteOutput = false,
bool Quiet = false,
@@ -223,7 +220,6 @@ public:
/// runPasses - Just like the method above, but this just returns true or
/// false indicating whether or not the optimizer crashed on the specified
/// input (true = crashed). Does not produce any output.
- ///
bool runPasses(Module &M, const std::vector<std::string> &PassesToRun) const {
std::string Filename;
return runPasses(M, PassesToRun, Filename, true);
@@ -247,7 +243,6 @@ public:
private:
/// initializeExecutionEnvironment - This method is used to set up the
/// environment for executing LLVM programs.
- ///
Error initializeExecutionEnvironment();
};
@@ -258,37 +253,31 @@ struct DiscardTemp {
/// Given a bitcode or assembly input filename, parse and return it, or return
/// null if not possible.
-///
std::unique_ptr<Module> parseInputFile(StringRef InputFilename,
LLVMContext &ctxt);
/// getPassesString - Turn a list of passes into a string which indicates the
/// command line options that must be passed to add the passes.
-///
std::string getPassesString(const std::vector<std::string> &Passes);
-/// PrintFunctionList - prints out list of problematic functions
-///
-void PrintFunctionList(const std::vector<Function *> &Funcs);
+/// Prints out list of problematic functions
+void printFunctionList(const std::vector<Function *> &Funcs);
-/// PrintGlobalVariableList - prints out list of problematic global variables
-///
-void PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
+/// Prints out list of problematic global variables
+void printGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer, making it external.
-//
-void DeleteGlobalInitializer(GlobalVariable *GV);
+/// "Remove" the global variable by deleting its initializer, making it
+/// external.
+void deleteGlobalInitializer(GlobalVariable *GV);
-// DeleteFunctionBody - "Remove" the function by deleting all of it's basic
-// blocks, making it external.
-//
-void DeleteFunctionBody(Function *F);
+/// "Remove" the function by deleting all of it's basic blocks, making it
+/// external.
+void deleteFunctionBody(Function *F);
/// Given a module and a list of functions in the module, split the functions
/// OUT of the specified module, and place them in the new module.
std::unique_ptr<Module>
-SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap);
} // End llvm namespace
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index fcac014..240300b 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -36,39 +36,44 @@
#include <set>
using namespace llvm;
-namespace {
-cl::opt<bool> KeepMain("keep-main",
- cl::desc("Force function reduction to keep main"),
- cl::init(false));
-cl::opt<bool> NoGlobalRM("disable-global-remove",
- cl::desc("Do not remove global variables"),
- cl::init(false));
-
-cl::opt<bool> NoAttributeRM("disable-attribute-remove",
- cl::desc("Do not remove function attributes"),
- cl::init(false));
-
-cl::opt<bool> ReplaceFuncsWithNull(
+static cl::opt<bool> KeepMain("keep-main",
+ cl::desc("Force function reduction to keep main"),
+ cl::init(false));
+static cl::opt<bool> NoGlobalRM("disable-global-remove",
+ cl::desc("Do not remove global variables"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoAttributeRM("disable-attribute-remove",
+ cl::desc("Do not remove function attributes"),
+ cl::init(false));
+
+static cl::opt<bool> ReplaceFuncsWithNull(
"replace-funcs-with-null",
cl::desc("When stubbing functions, replace all uses will null"),
cl::init(false));
-cl::opt<bool> DontReducePassList("disable-pass-list-reduction",
- cl::desc("Skip pass list reduction steps"),
- cl::init(false));
-
-cl::opt<bool> NoNamedMDRM("disable-namedmd-remove",
- cl::desc("Do not remove global named metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugInfo("disable-strip-debuginfo",
- cl::desc("Do not strip debug info metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugTypeInfo("disable-strip-debug-types",
- cl::desc("Do not strip debug type info metadata"),
- cl::init(false));
-cl::opt<bool> VerboseErrors("verbose-errors",
- cl::desc("Print the output of crashing program"),
- cl::init(false));
-}
+
+static cl::opt<bool>
+ DontReducePassList("disable-pass-list-reduction",
+ cl::desc("Skip pass list reduction steps"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoNamedMDRM("disable-namedmd-remove",
+ cl::desc("Do not remove global named metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugInfo("disable-strip-debuginfo",
+ cl::desc("Do not strip debug info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugTypeInfo("disable-strip-debug-types",
+ cl::desc("Do not strip debug type info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ VerboseErrors("verbose-errors",
+ cl::desc("Print the output of crashing program"),
+ cl::init(false));
static bool isValidModule(std::unique_ptr<Module> &M,
bool ExitOnFailure = true) {
@@ -83,6 +88,8 @@ static bool isValidModule(std::unique_ptr<Module> &M,
}
namespace llvm {
+// Note this class needs to be in llvm namespace since its declared as a friend
+// of BugDriver.
class ReducePassList : public ListReducer<std::string> {
BugDriver &BD;
@@ -95,7 +102,7 @@ public:
Expected<TestResult> doTest(std::vector<std::string> &Removed,
std::vector<std::string> &Kept) override;
};
-}
+} // namespace llvm
Expected<ReducePassList::TestResult>
ReducePassList::doTest(std::vector<std::string> &Prefix,
@@ -156,7 +163,7 @@ public:
bool TestGlobalVariables(std::vector<GlobalVariable *> &GVs);
};
-}
+} // namespace
bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
std::vector<GlobalVariable *> &GVs) {
@@ -174,14 +181,14 @@ bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
}
outs() << "Checking for crash with only these global variables: ";
- PrintGlobalVariableList(GVs);
+ printGlobalVariableList(GVs);
outs() << ": ";
// Loop over and delete any global variables which we aren't supposed to be
// playing with...
for (GlobalVariable &I : M->globals())
if (I.hasInitializer() && !GVSet.count(&I)) {
- DeleteGlobalInitializer(&I);
+ deleteGlobalInitializer(&I);
I.setLinkage(GlobalValue::ExternalLinkage);
I.setComdat(nullptr);
}
@@ -223,7 +230,7 @@ public:
bool TestFuncs(std::vector<Function *> &Prefix);
};
-}
+} // namespace
static void RemoveFunctionReferences(Module *M, const char *Name) {
auto *UsedVar = M->getGlobalVariable(Name, true);
@@ -269,14 +276,14 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
}
outs() << "Checking for crash with only these functions: ";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << ": ";
if (!ReplaceFuncsWithNull) {
// Loop over and delete any functions which we aren't supposed to be playing
// with...
for (Function &I : *M)
if (!I.isDeclaration() && !Functions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
} else {
std::vector<GlobalValue *> ToRemove;
// First, remove aliases to functions we're about to purge.
@@ -356,7 +363,7 @@ public:
bool TestFuncAttrs(std::vector<Attribute> &Attrs);
};
-}
+} // namespace
bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
std::vector<Attribute> &Attrs) {
@@ -396,12 +403,11 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
return false;
}
-namespace {
/// Simplify the CFG without completely destroying it.
/// This is not well defined, but basically comes down to "try to eliminate
/// unreachable blocks and constant fold terminators without deciding that
/// certain undefined behavior cuts off the program at the legs".
-void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
+static void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
if (F.empty())
return;
@@ -435,6 +441,8 @@ void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
for (auto *BB : Unreachable)
BB->eraseFromParent();
}
+
+namespace {
/// ReduceCrashingBlocks reducer - This works by setting the terminators of
/// all terminators except the specified basic blocks to a 'ret' instruction,
/// then running the simplifycfg pass. This has the effect of chopping up
@@ -459,7 +467,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -571,7 +579,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingConditionals::TestBlocks(
std::vector<const BasicBlock *> &BBs) {
@@ -670,7 +678,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -755,7 +763,7 @@ public:
bool TestInsts(std::vector<const Instruction *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingInstructions::TestInsts(
std::vector<const Instruction *> &Insts) {
@@ -896,7 +904,7 @@ public:
bool TestNamedMDs(std::vector<std::string> &NamedMDs);
};
-}
+} // namespace
bool ReduceCrashingNamedMD::TestNamedMDs(std::vector<std::string> &NamedMDs) {
@@ -959,7 +967,7 @@ public:
bool TestNamedMDOps(std::vector<const MDNode *> &NamedMDOps);
};
-}
+} // namespace
bool ReduceCrashingNamedMDOps::TestNamedMDOps(
std::vector<const MDNode *> &NamedMDOps) {
@@ -1018,7 +1026,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
for (GlobalVariable &GV : M->globals()) {
if (GV.hasInitializer()) {
- DeleteGlobalInitializer(&GV);
+ deleteGlobalInitializer(&GV);
GV.setLinkage(GlobalValue::ExternalLinkage);
GV.setComdat(nullptr);
DeletedInit = true;
@@ -1056,7 +1064,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
return E;
if (GVs.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-global-variables");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-global-variables");
}
return Error::success();
}
@@ -1155,7 +1163,7 @@ static Error ReduceInsts(BugDriver &BD, BugTester TestFn) {
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-instructions");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-instructions");
return Error::success();
}
@@ -1186,7 +1194,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
return E;
if (Functions.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function");
}
if (!NoAttributeRM) {
@@ -1218,7 +1226,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
}
if (OldSize < NewSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
}
}
@@ -1238,7 +1246,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-conditionals");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-conditionals");
}
// Attempt to delete entire basic blocks at a time to speed up
@@ -1256,7 +1264,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-blocks");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-blocks");
}
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
@@ -1269,7 +1277,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
}
// Attempt to delete instructions using bisection. This should help out nasty
@@ -1319,7 +1327,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-named-md");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-named-md");
}
// Try to clean up the testcase by running funcresolve and globaldce...
@@ -1334,7 +1342,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
std::move(M)); // Yup, it does, keep the reduced version...
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplified");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplified");
return Error::success();
}
@@ -1361,7 +1369,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(*Program, ID);
+ emitProgressBitcode(*Program, ID);
auto Res = DebugACrash(*this, TestForOptimizerCrash);
if (Res || DontReducePassList)
@@ -1376,7 +1384,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(getProgram(), "reduced-simplified");
+ emitProgressBitcode(getProgram(), "reduced-simplified");
return Res;
}
diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp
index 165b55f..8c6b7fb 100644
--- a/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -36,15 +36,16 @@ enum OutputType {
CompileCustom,
Custom
};
+} // namespace
-cl::opt<double> AbsTolerance("abs-tolerance",
- cl::desc("Absolute error tolerated"),
- cl::init(0.0));
-cl::opt<double> RelTolerance("rel-tolerance",
- cl::desc("Relative error tolerated"),
- cl::init(0.0));
+static cl::opt<double> AbsTolerance("abs-tolerance",
+ cl::desc("Absolute error tolerated"),
+ cl::init(0.0));
+static cl::opt<double> RelTolerance("rel-tolerance",
+ cl::desc("Relative error tolerated"),
+ cl::init(0.0));
-cl::opt<OutputType> InterpreterSel(
+static cl::opt<OutputType> InterpreterSel(
cl::desc("Specify the \"test\" i.e. suspect back-end:"),
cl::values(clEnumValN(AutoPick, "auto", "Use best guess"),
clEnumValN(RunLLI, "run-int", "Execute with the interpreter"),
@@ -60,7 +61,7 @@ cl::opt<OutputType> InterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<OutputType> SafeInterpreterSel(
+static cl::opt<OutputType> SafeInterpreterSel(
cl::desc("Specify \"safe\" i.e. known-good backend:"),
cl::values(clEnumValN(AutoPick, "safe-auto", "Use best guess"),
clEnumValN(RunLLC, "safe-run-llc", "Compile with LLC"),
@@ -69,16 +70,16 @@ cl::opt<OutputType> SafeInterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<std::string> SafeInterpreterPath(
+static cl::opt<std::string> SafeInterpreterPath(
"safe-path", cl::desc("Specify the path to the \"safe\" backend program"),
cl::init(""));
-cl::opt<bool> AppendProgramExitCode(
+static cl::opt<bool> AppendProgramExitCode(
"append-exit-code",
cl::desc("Append the exit code to the output so it gets diff'd too"),
cl::init(false));
-cl::opt<std::string>
+static cl::opt<std::string>
InputFile("input", cl::init("/dev/null"),
cl::desc("Filename to pipe in as stdin (default: /dev/null)"));
@@ -89,20 +90,19 @@ static cl::list<std::string>
static cl::list<std::string> AdditionalLinkerArgs(
"Xlinker", cl::desc("Additional arguments to pass to the linker"));
-cl::opt<std::string> CustomCompileCommand(
+static cl::opt<std::string> CustomCompileCommand(
"compile-command", cl::init("llc"),
cl::desc("Command to compile the bitcode (use with -compile-custom) "
"(default: llc)"));
-cl::opt<std::string> CustomExecCommand(
+static cl::opt<std::string> CustomExecCommand(
"exec-command", cl::init("simulate"),
cl::desc("Command to execute the bitcode (use with -run-custom) "
"(default: simulate)"));
-}
-namespace llvm {
// Anything specified after the --args option are taken as arguments to the
// program being debugged.
+namespace llvm {
cl::list<std::string> InputArgv("args", cl::Positional,
cl::desc("<program arguments>..."),
cl::PositionalEatsArgs);
@@ -110,25 +110,22 @@ cl::list<std::string> InputArgv("args", cl::Positional,
cl::opt<std::string>
OutputPrefix("output-prefix", cl::init("bugpoint"),
cl::desc("Prefix to use for outputs (default: 'bugpoint')"));
-}
-
-namespace {
-cl::list<std::string> ToolArgv("tool-args", cl::Positional,
- cl::desc("<tool arguments>..."),
- cl::PositionalEatsArgs);
+} // namespace llvm
-cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
- cl::desc("<safe-tool arguments>..."),
- cl::PositionalEatsArgs);
+static cl::list<std::string> ToolArgv("tool-args", cl::Positional,
+ cl::desc("<tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::opt<std::string> CCBinary("gcc", cl::init(""),
- cl::desc("The gcc binary to use."));
+static cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
+ cl::desc("<safe-tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
- cl::desc("<gcc-tool arguments>..."),
- cl::PositionalEatsArgs);
-}
+static cl::opt<std::string> CCBinary("gcc", cl::init(""),
+ cl::desc("The gcc binary to use."));
+static cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
+ cl::desc("<gcc-tool arguments>..."),
+ cl::PositionalEatsArgs);
//===----------------------------------------------------------------------===//
// BugDriver method implementation
//
diff --git a/llvm/tools/bugpoint/ExtractFunction.cpp b/llvm/tools/bugpoint/ExtractFunction.cpp
index dd9a82c..3206589 100644
--- a/llvm/tools/bugpoint/ExtractFunction.cpp
+++ b/llvm/tools/bugpoint/ExtractFunction.cpp
@@ -35,19 +35,19 @@ using namespace llvm;
#define DEBUG_TYPE "bugpoint"
+bool llvm::DisableSimplifyCFG = false;
namespace llvm {
-bool DisableSimplifyCFG = false;
extern cl::opt<std::string> OutputPrefix;
-} // End llvm namespace
+} // namespace llvm
-namespace {
-cl::opt<bool> NoDCE("disable-dce",
- cl::desc("Do not use the -dce pass to reduce testcases"));
-cl::opt<bool, true>
+static cl::opt<bool>
+ NoDCE("disable-dce",
+ cl::desc("Do not use the -dce pass to reduce testcases"));
+static cl::opt<bool, true>
NoSCFG("disable-simplifycfg", cl::location(DisableSimplifyCFG),
cl::desc("Do not use the -simplifycfg pass to reduce testcases"));
-Function *globalInitUsesExternalBA(GlobalVariable *GV) {
+static Function *globalInitUsesExternalBA(GlobalVariable *GV) {
if (!GV->hasInitializer())
return nullptr;
@@ -78,7 +78,6 @@ Function *globalInitUsesExternalBA(GlobalVariable *GV) {
}
return nullptr;
}
-} // end anonymous namespace
std::unique_ptr<Module>
BugDriver::deleteInstructionFromProgram(const Instruction *I,
@@ -154,7 +153,7 @@ std::unique_ptr<Module> BugDriver::extractLoop(Module *M) {
std::unique_ptr<Module> NewM = runPassesOn(M, LoopExtractPasses);
if (!NewM) {
outs() << "*** Loop extraction failed: ";
- EmitProgressBitcode(*M, "loopextraction", true);
+ emitProgressBitcode(*M, "loopextraction", true);
outs() << "*** Sorry. :( Please report a bug!\n";
return nullptr;
}
@@ -198,21 +197,16 @@ static void eliminateAliases(GlobalValue *GV) {
}
}
-//
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer,
-// making it external.
-//
-void llvm::DeleteGlobalInitializer(GlobalVariable *GV) {
+// "Remove" the global variable by deleting its initializer, making it external.
+void llvm::deleteGlobalInitializer(GlobalVariable *GV) {
eliminateAliases(GV);
GV->setInitializer(nullptr);
GV->setComdat(nullptr);
}
-// DeleteFunctionBody - "Remove" the function by deleting all of its basic
-// blocks, making it external.
-//
-void llvm::DeleteFunctionBody(Function *F) {
+// "Remove" the function by deleting all of its basic blocks, making it
+// external.
+void llvm::deleteFunctionBody(Function *F) {
eliminateAliases(F);
// Function declarations can't have comdats.
F->setComdat(nullptr);
@@ -222,9 +216,9 @@ void llvm::DeleteFunctionBody(Function *F) {
assert(F->isDeclaration() && "This didn't make the function external!");
}
-/// GetTorInit - Given a list of entries for static ctors/dtors, return them
+/// getTorInit - Given a list of entries for static ctors/dtors, return them
/// as a constant array.
-static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
+static Constant *getTorInit(std::vector<std::pair<Function *, int>> &TorList) {
assert(!TorList.empty() && "Don't create empty tor list!");
std::vector<Constant *> ArrayElts;
Type *Int32Ty = Type::getInt32Ty(TorList[0].first->getContext());
@@ -239,11 +233,11 @@ static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
ArrayType::get(ArrayElts[0]->getType(), ArrayElts.size()), ArrayElts);
}
-/// SplitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
+/// splitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
/// M1 has all of the global variables. If M2 contains any functions that are
/// static ctors/dtors, we need to add an llvm.global_[cd]tors global to M2, and
/// prune appropriate entries out of M1s list.
-static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
+static void splitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
ValueToValueMapTy &VMap) {
GlobalVariable *GV = M1->getNamedGlobal(GlobalName);
if (!GV || GV->isDeclaration() || GV->hasLocalLinkage() || !GV->use_empty())
@@ -284,7 +278,7 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M1Tors.empty()) {
- Constant *M1Init = GetTorInit(M1Tors);
+ Constant *M1Init = getTorInit(M1Tors);
new GlobalVariable(*M1, M1Init->getType(), false,
GlobalValue::AppendingLinkage, M1Init, GlobalName);
}
@@ -295,14 +289,14 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M2Tors.empty()) {
- Constant *M2Init = GetTorInit(M2Tors);
+ Constant *M2Init = getTorInit(M2Tors);
new GlobalVariable(*M2, M2Init->getType(), false,
GlobalValue::AppendingLinkage, M2Init, GlobalName);
}
}
std::unique_ptr<Module>
-llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+llvm::splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap) {
// Make sure functions & globals are all external so that linkage
// between the two modules will work.
@@ -326,13 +320,13 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
LLVM_DEBUG(TNOF->printAsOperand(errs(), false));
LLVM_DEBUG(errs() << "\n");
TestFunctions.insert(cast<Function>(NewVMap[TNOF]));
- DeleteFunctionBody(TNOF); // Function is now external in this module!
+ deleteFunctionBody(TNOF); // Function is now external in this module!
}
// Remove the Safe functions from the Test module
for (Function &I : *New)
if (!TestFunctions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
// Try to split the global initializers evenly
for (GlobalVariable &I : M->globals()) {
@@ -348,17 +342,17 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
<< TestFn->getName() << "'.\n";
exit(1);
}
- DeleteGlobalInitializer(&I); // Delete the initializer to make it external
+ deleteGlobalInitializer(&I); // Delete the initializer to make it external
} else {
// If we keep it in the safe module, then delete it in the test module
- DeleteGlobalInitializer(GV);
+ deleteGlobalInitializer(GV);
}
}
// Make sure that there is a global ctor/dtor array in both halves of the
// module if they both have static ctor/dtor functions.
- SplitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
- SplitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
return New;
}
@@ -375,7 +369,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
outs() << "*** Basic Block extraction failed!\n";
errs() << "Error creating temporary file: " << toString(Temp.takeError())
<< "\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
return nullptr;
}
DiscardTemp Discard{*Temp};
@@ -399,7 +393,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
OS.flush();
if (OS.has_error()) {
errs() << "Error writing list of blocks to not extract\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
OS.clear_error();
return nullptr;
}
@@ -413,7 +407,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
if (!Ret) {
outs() << "*** Basic Block extraction failed, please report a bug!\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
}
return Ret;
}
diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp
index 4cf7de3..a7f1643 100644
--- a/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/llvm/tools/bugpoint/Miscompilation.cpp
@@ -33,16 +33,16 @@ extern cl::opt<std::string> OutputPrefix;
extern cl::list<std::string> InputArgv;
} // end namespace llvm
-namespace {
-static llvm::cl::opt<bool> DisableLoopExtraction(
+static cl::opt<bool> DisableLoopExtraction(
"disable-loop-extraction",
cl::desc("Don't extract loops when searching for miscompilations"),
cl::init(false));
-static llvm::cl::opt<bool> DisableBlockExtraction(
+static cl::opt<bool> DisableBlockExtraction(
"disable-block-extraction",
cl::desc("Don't extract blocks when searching for miscompilations"),
cl::init(false));
+namespace {
class ReduceMiscompilingPasses : public ListReducer<std::string> {
BugDriver &BD;
@@ -71,7 +71,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -113,7 +113,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Prefix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -158,7 +158,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -253,7 +253,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
<< (Funcs.size() == 1 ? "this function is" : "these functions are")
<< " run through the pass"
<< (BD.getPassesToRun().size() == 1 ? "" : "es") << ":";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << '\n';
// Create a clone for two reasons:
@@ -277,7 +277,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
VMap.clear();
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
Expected<bool> Broken =
TestFn(BD, std::move(ToOptimize), std::move(ToNotOptimize));
@@ -314,7 +314,7 @@ ExtractLoops(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
- std::unique_ptr<Module> ToOptimize = SplitFunctionsOutOfModule(
+ std::unique_ptr<Module> ToOptimize = splitFunctionsOutOfModule(
ToNotOptimize.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> ToOptimizeLoopExtracted =
BD.extractLoop(ToOptimize.get());
@@ -517,7 +517,7 @@ ReduceMiscompiledBlocks::TestFuncs(const std::vector<BasicBlock *> &BBs) {
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
// Try the extraction. If it doesn't work, then the block extractor crashed
// or something, in which case bugpoint can't chase down this possibility.
@@ -572,7 +572,7 @@ ExtractBlocks(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ProgClone = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToExtract =
- SplitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
+ splitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> Extracted =
BD.extractMappedBlocksFromModule(Blocks, ToExtract.get());
if (!Extracted) {
@@ -638,7 +638,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
// See if we can rip any loops out of the miscompiled functions and still
@@ -663,7 +663,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -686,7 +686,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -708,7 +708,7 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
if (!Optimized) {
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
- BD.EmitProgressBitcode(*Test, "pass-error", false);
+ BD.emitProgressBitcode(*Test, "pass-error", false);
BD.setNewProgram(std::move(Test));
if (Error E = BD.debugOptimizerCrash())
return std::move(E);
@@ -750,7 +750,7 @@ Error BugDriver::debugMiscompilation() {
outs() << "\n*** Found miscompiling pass"
<< (getPassesToRun().size() == 1 ? "" : "es") << ": "
<< getPassesString(getPassesToRun()) << '\n';
- EmitProgressBitcode(*Program, "passinput");
+ emitProgressBitcode(*Program, "passinput");
Expected<std::vector<Function *>> MiscompiledFunctions =
DebugAMiscompilation(*this, TestOptimizer);
@@ -762,15 +762,15 @@ Error BugDriver::debugMiscompilation() {
ValueToValueMapTy VMap;
Module *ToNotOptimize = CloneModule(getProgram(), VMap).release();
Module *ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
+ splitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
.release();
outs() << " Non-optimized portion: ";
- EmitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
+ emitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
delete ToNotOptimize; // Delete hacked module.
outs() << " Portion that is input to optimizer: ";
- EmitProgressBitcode(*ToOptimize, "tooptimize");
+ emitProgressBitcode(*ToOptimize, "tooptimize");
delete ToOptimize; // Delete hacked module.
return Error::success();
@@ -1028,7 +1028,7 @@ Error BugDriver::debugCodeGenerator() {
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotCodeGen = CloneModule(getProgram(), VMap);
std::unique_ptr<Module> ToCodeGen =
- SplitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
+ splitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
// Condition the modules
ToCodeGen =
diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp
index 3daacfd..bf2e8c0 100644
--- a/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -82,7 +82,7 @@ bool BugDriver::writeProgramToFile(const std::string &Filename,
/// This function is used to output the current Program to a file named
/// "bugpoint-ID.bc".
-void BugDriver::EmitProgressBitcode(const Module &M, const std::string &ID,
+void BugDriver::emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer) const {
// Output the input to the current pass to a bitcode file, emit a message
// telling the user how to reproduce it: opt -foo blah.bc
diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp
index f2f5966a..c67695f 100644
--- a/llvm/tools/bugpoint/ToolRunner.cpp
+++ b/llvm/tools/bugpoint/ToolRunner.cpp
@@ -25,29 +25,25 @@ using namespace llvm;
#define DEBUG_TYPE "toolrunner"
-namespace llvm {
-cl::opt<bool> SaveTemps("save-temps", cl::init(false),
- cl::desc("Save temporary files"));
-}
+cl::opt<bool> llvm::SaveTemps("save-temps", cl::init(false),
+ cl::desc("Save temporary files"));
-namespace {
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteClient("remote-client",
cl::desc("Remote execution client (rsh/ssh)"));
-cl::opt<std::string> RemoteHost("remote-host",
- cl::desc("Remote execution (rsh/ssh) host"));
+static cl::opt<std::string>
+ RemoteHost("remote-host", cl::desc("Remote execution (rsh/ssh) host"));
-cl::opt<std::string> RemotePort("remote-port",
- cl::desc("Remote execution (rsh/ssh) port"));
+static cl::opt<std::string>
+ RemotePort("remote-port", cl::desc("Remote execution (rsh/ssh) port"));
-cl::opt<std::string> RemoteUser("remote-user",
- cl::desc("Remote execution (rsh/ssh) user id"));
+static cl::opt<std::string>
+ RemoteUser("remote-user", cl::desc("Remote execution (rsh/ssh) user id"));
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteExtra("remote-extra-options",
cl::desc("Remote execution (rsh/ssh) extra options"));
-}
/// RunProgramWithTimeout - This function provides an alternate interface
/// to the sys::Program::ExecuteAndWait interface.
@@ -160,7 +156,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
@@ -258,7 +254,7 @@ public:
inconvertibleErrorCode());
}
};
-}
+} // namespace
Error CustomCompiler::compileProgram(const std::string &Bitcode,
unsigned Timeout, unsigned MemoryLimit) {
@@ -301,7 +297,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> CustomExecutor::ExecuteProgram(
const std::string &Bitcode, const std::vector<std::string> &Args,
@@ -541,7 +537,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index 87581e80a..52ed135 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -90,7 +90,7 @@ public:
D.addPass(std::string(PI->getPassArgument()));
}
};
-}
+} // namespace
#define HANDLE_EXTENSION(Ext) \
llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index f04b256..8b03db3 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -218,13 +218,12 @@ static cl::opt<std::string> PassPipeline(
static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline),
cl::desc("Alias for -passes"));
-namespace {
-
-std::vector<std::string> &getRunPassNames() {
+static std::vector<std::string> &getRunPassNames() {
static std::vector<std::string> RunPassNames;
return RunPassNames;
}
+namespace {
struct RunPassOption {
void operator=(const std::string &Val) const {
if (Val.empty())
diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp
index 875ec1b..7fee06b 100644
--- a/llvm/tools/lli/lli.cpp
+++ b/llvm/tools/lli/lli.cpp
@@ -92,206 +92,202 @@ static codegen::RegisterCodeGenFlags CGF;
#define DEBUG_TYPE "lli"
namespace {
-
- enum class JITKind { MCJIT, Orc, OrcLazy };
- enum class JITLinkerKind { Default, RuntimeDyld, JITLink };
-
- cl::opt<std::string>
- InputFile(cl::desc("<input bitcode>"), cl::Positional, cl::init("-"));
-
- cl::list<std::string>
- InputArgv(cl::ConsumeAfter, cl::desc("<program arguments>..."));
-
- cl::opt<bool> ForceInterpreter("force-interpreter",
- cl::desc("Force interpretation: disable JIT"),
- cl::init(false));
-
- cl::opt<JITKind> UseJITKind(
- "jit-kind", cl::desc("Choose underlying JIT kind."),
- cl::init(JITKind::Orc),
- cl::values(clEnumValN(JITKind::MCJIT, "mcjit", "MCJIT"),
- clEnumValN(JITKind::Orc, "orc", "Orc JIT"),
- clEnumValN(JITKind::OrcLazy, "orc-lazy",
- "Orc-based lazy JIT.")));
-
- cl::opt<JITLinkerKind>
- JITLinker("jit-linker", cl::desc("Choose the dynamic linker/loader."),
- cl::init(JITLinkerKind::Default),
- cl::values(clEnumValN(JITLinkerKind::Default, "default",
- "Default for platform and JIT-kind"),
- clEnumValN(JITLinkerKind::RuntimeDyld, "rtdyld",
- "RuntimeDyld"),
- clEnumValN(JITLinkerKind::JITLink, "jitlink",
- "Orc-specific linker")));
- cl::opt<std::string> OrcRuntime("orc-runtime",
- cl::desc("Use ORC runtime from given path"),
- cl::init(""));
-
- cl::opt<unsigned>
- LazyJITCompileThreads("compile-threads",
- cl::desc("Choose the number of compile threads "
- "(jit-kind=orc-lazy only)"),
- cl::init(0));
-
- cl::list<std::string>
- ThreadEntryPoints("thread-entry",
- cl::desc("calls the given entry-point on a new thread "
- "(jit-kind=orc-lazy only)"));
-
- cl::opt<bool> PerModuleLazy(
- "per-module-lazy",
- cl::desc("Performs lazy compilation on whole module boundaries "
- "rather than individual functions"),
- cl::init(false));
-
- cl::list<std::string>
- JITDylibs("jd",
- cl::desc("Specifies the JITDylib to be used for any subsequent "
- "-extra-module arguments."));
-
- cl::list<std::string>
- Dylibs("dlopen", cl::desc("Dynamic libraries to load before linking"));
-
- // The MCJIT supports building for a target address space separate from
- // the JIT compilation process. Use a forked process and a copying
- // memory manager with IPC to execute using this functionality.
- cl::opt<bool> RemoteMCJIT("remote-mcjit",
- cl::desc("Execute MCJIT'ed code in a separate process."),
+enum class JITKind { MCJIT, Orc, OrcLazy };
+enum class JITLinkerKind { Default, RuntimeDyld, JITLink };
+} // namespace
+
+static cl::opt<std::string> InputFile(cl::desc("<input bitcode>"),
+ cl::Positional, cl::init("-"));
+
+static cl::list<std::string> InputArgv(cl::ConsumeAfter,
+ cl::desc("<program arguments>..."));
+
+static cl::opt<bool>
+ ForceInterpreter("force-interpreter",
+ cl::desc("Force interpretation: disable JIT"),
+ cl::init(false));
+
+static cl::opt<JITKind>
+ UseJITKind("jit-kind", cl::desc("Choose underlying JIT kind."),
+ cl::init(JITKind::Orc),
+ cl::values(clEnumValN(JITKind::MCJIT, "mcjit", "MCJIT"),
+ clEnumValN(JITKind::Orc, "orc", "Orc JIT"),
+ clEnumValN(JITKind::OrcLazy, "orc-lazy",
+ "Orc-based lazy JIT.")));
+
+static cl::opt<JITLinkerKind> JITLinker(
+ "jit-linker", cl::desc("Choose the dynamic linker/loader."),
+ cl::init(JITLinkerKind::Default),
+ cl::values(clEnumValN(JITLinkerKind::Default, "default",
+ "Default for platform and JIT-kind"),
+ clEnumValN(JITLinkerKind::RuntimeDyld, "rtdyld", "RuntimeDyld"),
+ clEnumValN(JITLinkerKind::JITLink, "jitlink",
+ "Orc-specific linker")));
+static cl::opt<std::string>
+ OrcRuntime("orc-runtime", cl::desc("Use ORC runtime from given path"),
+ cl::init(""));
+
+static cl::opt<unsigned>
+ LazyJITCompileThreads("compile-threads",
+ cl::desc("Choose the number of compile threads "
+ "(jit-kind=orc-lazy only)"),
+ cl::init(0));
+
+static cl::list<std::string>
+ ThreadEntryPoints("thread-entry",
+ cl::desc("calls the given entry-point on a new thread "
+ "(jit-kind=orc-lazy only)"));
+
+static cl::opt<bool> PerModuleLazy(
+ "per-module-lazy",
+ cl::desc("Performs lazy compilation on whole module boundaries "
+ "rather than individual functions"),
cl::init(false));
- // Manually specify the child process for remote execution. This overrides
- // the simulated remote execution that allocates address space for child
- // execution. The child process will be executed and will communicate with
- // lli via stdin/stdout pipes.
- cl::opt<std::string>
- ChildExecPath("mcjit-remote-process",
- cl::desc("Specify the filename of the process to launch "
- "for remote MCJIT execution. If none is specified,"
- "\n\tremote execution will be simulated in-process."),
- cl::value_desc("filename"), cl::init(""));
-
- // Determine optimization level.
- cl::opt<char> OptLevel("O",
- cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
- "(default = '-O2')"),
- cl::Prefix, cl::init('2'));
-
- cl::opt<std::string>
- TargetTriple("mtriple", cl::desc("Override target triple for module"));
-
- cl::opt<std::string>
- EntryFunc("entry-function",
- cl::desc("Specify the entry function (default = 'main') "
- "of the executable"),
- cl::value_desc("function"),
- cl::init("main"));
-
- cl::list<std::string>
- ExtraModules("extra-module",
- cl::desc("Extra modules to be loaded"),
- cl::value_desc("input bitcode"));
-
- cl::list<std::string>
- ExtraObjects("extra-object",
- cl::desc("Extra object files to be loaded"),
- cl::value_desc("input object"));
-
- cl::list<std::string>
- ExtraArchives("extra-archive",
- cl::desc("Extra archive files to be loaded"),
- cl::value_desc("input archive"));
-
- cl::opt<bool>
- EnableCacheManager("enable-cache-manager",
- cl::desc("Use cache manager to save/load modules"),
- cl::init(false));
-
- cl::opt<std::string>
- ObjectCacheDir("object-cache-dir",
- cl::desc("Directory to store cached object files "
- "(must be user writable)"),
- cl::init(""));
-
- cl::opt<std::string>
- FakeArgv0("fake-argv0",
- cl::desc("Override the 'argv[0]' value passed into the executing"
- " program"), cl::value_desc("executable"));
-
- cl::opt<bool>
- DisableCoreFiles("disable-core-files", cl::Hidden,
- cl::desc("Disable emission of core files if possible"));
-
- cl::opt<bool>
- NoLazyCompilation("disable-lazy-compilation",
- cl::desc("Disable JIT lazy compilation"),
- cl::init(false));
-
- cl::opt<bool>
- GenerateSoftFloatCalls("soft-float",
- cl::desc("Generate software floating point library calls"),
+static cl::list<std::string>
+ JITDylibs("jd",
+ cl::desc("Specifies the JITDylib to be used for any subsequent "
+ "-extra-module arguments."));
+
+static cl::list<std::string>
+ Dylibs("dlopen", cl::desc("Dynamic libraries to load before linking"));
+
+// The MCJIT supports building for a target address space separate from
+// the JIT compilation process. Use a forked process and a copying
+// memory manager with IPC to execute using this functionality.
+static cl::opt<bool>
+ RemoteMCJIT("remote-mcjit",
+ cl::desc("Execute MCJIT'ed code in a separate process."),
+ cl::init(false));
+
+// Manually specify the child process for remote execution. This overrides
+// the simulated remote execution that allocates address space for child
+// execution. The child process will be executed and will communicate with
+// lli via stdin/stdout pipes.
+static cl::opt<std::string> ChildExecPath(
+ "mcjit-remote-process",
+ cl::desc("Specify the filename of the process to launch "
+ "for remote MCJIT execution. If none is specified,"
+ "\n\tremote execution will be simulated in-process."),
+ cl::value_desc("filename"), cl::init(""));
+
+// Determine optimization level.
+static cl::opt<char>
+ OptLevel("O",
+ cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
+ "(default = '-O2')"),
+ cl::Prefix, cl::init('2'));
+
+static cl::opt<std::string>
+ TargetTriple("mtriple", cl::desc("Override target triple for module"));
+
+static cl::opt<std::string>
+ EntryFunc("entry-function",
+ cl::desc("Specify the entry function (default = 'main') "
+ "of the executable"),
+ cl::value_desc("function"), cl::init("main"));
+
+static cl::list<std::string>
+ ExtraModules("extra-module", cl::desc("Extra modules to be loaded"),
+ cl::value_desc("input bitcode"));
+
+static cl::list<std::string>
+ ExtraObjects("extra-object", cl::desc("Extra object files to be loaded"),
+ cl::value_desc("input object"));
+
+static cl::list<std::string>
+ ExtraArchives("extra-archive", cl::desc("Extra archive files to be loaded"),
+ cl::value_desc("input archive"));
+
+static cl::opt<bool>
+ EnableCacheManager("enable-cache-manager",
+ cl::desc("Use cache manager to save/load modules"),
+ cl::init(false));
+
+static cl::opt<std::string>
+ ObjectCacheDir("object-cache-dir",
+ cl::desc("Directory to store cached object files "
+ "(must be user writable)"),
+ cl::init(""));
+
+static cl::opt<std::string>
+ FakeArgv0("fake-argv0",
+ cl::desc("Override the 'argv[0]' value passed into the executing"
+ " program"),
+ cl::value_desc("executable"));
+
+static cl::opt<bool>
+ DisableCoreFiles("disable-core-files", cl::Hidden,
+ cl::desc("Disable emission of core files if possible"));
+
+static cl::opt<bool> NoLazyCompilation("disable-lazy-compilation",
+ cl::desc("Disable JIT lazy compilation"),
+ cl::init(false));
+
+static cl::opt<bool> GenerateSoftFloatCalls(
+ "soft-float", cl::desc("Generate software floating point library calls"),
cl::init(false));
- cl::opt<bool> NoProcessSymbols(
- "no-process-syms",
- cl::desc("Do not resolve lli process symbols in JIT'd code"),
- cl::init(false));
-
- enum class LLJITPlatform { Inactive, Auto, ExecutorNative, GenericIR };
-
- cl::opt<LLJITPlatform> Platform(
- "lljit-platform", cl::desc("Platform to use with LLJIT"),
- cl::init(LLJITPlatform::Auto),
- cl::values(clEnumValN(LLJITPlatform::Auto, "Auto",
- "Like 'ExecutorNative' if ORC runtime "
- "provided, otherwise like 'GenericIR'"),
- clEnumValN(LLJITPlatform::ExecutorNative, "ExecutorNative",
- "Use the native platform for the executor."
- "Requires -orc-runtime"),
- clEnumValN(LLJITPlatform::GenericIR, "GenericIR",
- "Use LLJITGenericIRPlatform"),
- clEnumValN(LLJITPlatform::Inactive, "Inactive",
- "Disable platform support explicitly")),
- cl::Hidden);
-
- enum class DumpKind {
- NoDump,
- DumpFuncsToStdOut,
- DumpModsToStdOut,
- DumpModsToDisk,
- DumpDebugDescriptor,
- DumpDebugObjects,
- };
+static cl::opt<bool> NoProcessSymbols(
+ "no-process-syms",
+ cl::desc("Do not resolve lli process symbols in JIT'd code"),
+ cl::init(false));
- cl::opt<DumpKind> OrcDumpKind(
- "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."),
- cl::init(DumpKind::NoDump),
- cl::values(
- clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."),
- clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout",
- "Dump function names to stdout."),
- clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout",
- "Dump modules to stdout."),
- clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk",
- "Dump modules to the current "
- "working directory. (WARNING: "
- "will overwrite existing files)."),
- clEnumValN(DumpKind::DumpDebugDescriptor, "jit-debug-descriptor",
- "Dump __jit_debug_descriptor contents to stdout"),
- clEnumValN(DumpKind::DumpDebugObjects, "jit-debug-objects",
- "Dump __jit_debug_descriptor in-memory debug "
- "objects as tool output")),
- cl::Hidden);
-
- ExitOnError ExitOnErr;
-}
+enum class LLJITPlatform { Inactive, Auto, ExecutorNative, GenericIR };
+
+static cl::opt<LLJITPlatform> Platform(
+ "lljit-platform", cl::desc("Platform to use with LLJIT"),
+ cl::init(LLJITPlatform::Auto),
+ cl::values(clEnumValN(LLJITPlatform::Auto, "Auto",
+ "Like 'ExecutorNative' if ORC runtime "
+ "provided, otherwise like 'GenericIR'"),
+ clEnumValN(LLJITPlatform::ExecutorNative, "ExecutorNative",
+ "Use the native platform for the executor."
+ "Requires -orc-runtime"),
+ clEnumValN(LLJITPlatform::GenericIR, "GenericIR",
+ "Use LLJITGenericIRPlatform"),
+ clEnumValN(LLJITPlatform::Inactive, "Inactive",
+ "Disable platform support explicitly")),
+ cl::Hidden);
+
+enum class DumpKind {
+ NoDump,
+ DumpFuncsToStdOut,
+ DumpModsToStdOut,
+ DumpModsToDisk,
+ DumpDebugDescriptor,
+ DumpDebugObjects,
+};
-LLVM_ATTRIBUTE_USED void linkComponents() {
+static cl::opt<DumpKind> OrcDumpKind(
+ "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."),
+ cl::init(DumpKind::NoDump),
+ cl::values(clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."),
+ clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout",
+ "Dump function names to stdout."),
+ clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout",
+ "Dump modules to stdout."),
+ clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk",
+ "Dump modules to the current "
+ "working directory. (WARNING: "
+ "will overwrite existing files)."),
+ clEnumValN(DumpKind::DumpDebugDescriptor, "jit-debug-descriptor",
+ "Dump __jit_debug_descriptor contents to stdout"),
+ clEnumValN(DumpKind::DumpDebugObjects, "jit-debug-objects",
+ "Dump __jit_debug_descriptor in-memory debug "
+ "objects as tool output")),
+ cl::Hidden);
+
+static ExitOnError ExitOnErr;
+
+LLVM_ATTRIBUTE_USED static void linkComponents() {
errs() << (void *)&llvm_orc_registerEHFrameSectionAllocAction
<< (void *)&llvm_orc_deregisterEHFrameSectionAllocAction
<< (void *)&llvm_orc_registerJITLoaderGDBWrapper
<< (void *)&llvm_orc_registerJITLoaderGDBAllocAction;
}
+namespace {
//===----------------------------------------------------------------------===//
// Object cache
//
@@ -367,6 +363,7 @@ private:
return true;
}
};
+} // namespace
// On Mingw and Cygwin, an external symbol named '__main' is called from the
// generated 'main' function to allow static initialization. To avoid linking
@@ -400,7 +397,7 @@ static void addCygMingExtraModule(ExecutionEngine &EE, LLVMContext &Context,
EE.addModule(std::move(M));
}
-CodeGenOptLevel getOptLevel() {
+static CodeGenOptLevel getOptLevel() {
if (auto Level = CodeGenOpt::parseLevel(OptLevel))
return *Level;
WithColor::error(errs(), "lli") << "invalid optimization level.\n";
@@ -412,10 +409,10 @@ CodeGenOptLevel getOptLevel() {
exit(1);
}
-Error loadDylibs();
-int runOrcJIT(const char *ProgName);
-void disallowOrcOptions();
-Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote();
+static Error loadDylibs();
+static int runOrcJIT(const char *ProgName);
+static void disallowOrcOptions();
+static Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote();
//===----------------------------------------------------------------------===//
// main Driver function
@@ -863,7 +860,7 @@ static std::function<void(MemoryBuffer &)> createObjDebugDumper() {
llvm_unreachable("Unknown DumpKind");
}
-Error loadDylibs() {
+static Error loadDylibs() {
for (const auto &Dylib : Dylibs) {
std::string ErrMsg;
if (sys::DynamicLibrary::LoadLibraryPermanently(Dylib.c_str(), &ErrMsg))
@@ -875,7 +872,7 @@ Error loadDylibs() {
static void exitOnLazyCallThroughFailure() { exit(1); }
-Expected<orc::ThreadSafeModule>
+static Expected<orc::ThreadSafeModule>
loadModule(StringRef Path, orc::ThreadSafeContext TSCtx) {
SMDiagnostic Err;
auto M = TSCtx.withContextDo(
@@ -895,7 +892,7 @@ loadModule(StringRef Path, orc::ThreadSafeContext TSCtx) {
return orc::ThreadSafeModule(std::move(M), std::move(TSCtx));
}
-int mingw_noop_main(void) {
+static int mingw_noop_main(void) {
// Cygwin and MinGW insert calls from the main function to the runtime
// function __main. The __main function is responsible for setting up main's
// environment (e.g. running static constructors), however this is not needed
@@ -912,7 +909,7 @@ int mingw_noop_main(void) {
// Try to enable debugger support for the given instance.
// This alway returns success, but prints a warning if it's not able to enable
// debugger support.
-Error tryEnableDebugSupport(orc::LLJIT &J) {
+static Error tryEnableDebugSupport(orc::LLJIT &J) {
if (auto Err = enableDebuggerSupport(J)) {
[[maybe_unused]] std::string ErrMsg = toString(std::move(Err));
LLVM_DEBUG(dbgs() << "lli: " << ErrMsg << "\n");
@@ -920,7 +917,7 @@ Error tryEnableDebugSupport(orc::LLJIT &J) {
return Error::success();
}
-int runOrcJIT(const char *ProgName) {
+static int runOrcJIT(const char *ProgName) {
// Start setting up the JIT environment.
// Parse the main module.
@@ -1187,7 +1184,7 @@ int runOrcJIT(const char *ProgName) {
return Result;
}
-void disallowOrcOptions() {
+static void disallowOrcOptions() {
// Make sure nobody used an orc-lazy specific option accidentally.
if (LazyJITCompileThreads != 0) {
@@ -1206,7 +1203,7 @@ void disallowOrcOptions() {
}
}
-Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote() {
+static Expected<std::unique_ptr<orc::ExecutorProcessControl>> launchRemote() {
#ifndef LLVM_ON_UNIX
llvm_unreachable("launchRemote not supported on non-Unix platforms");
#else
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index a2f4b3e..9db7aa0 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -43,6 +43,9 @@ int llvm_test_dibuilder(void) {
LLVMMetadataRef File = LLVMDIBuilderCreateFile(DIB, Filename,
strlen(Filename), ".", 1);
+ LLVMMetadataRef FileCS = LLVMDIBuilderCreateFileWithChecksum(
+ DIB, Filename, strlen(Filename), ".", 1, CSK_MD5, "1234", 4, "source", 6);
+
LLVMMetadataRef CompileUnit = LLVMDIBuilderCreateCompileUnit(
DIB, LLVMDWARFSourceLanguageC, File, "llvm-c-test", 11, 0, NULL, 0, 0,
NULL, 0, LLVMDWARFEmissionFull, 0, 0, 0, "/", 1, "", 0);
@@ -61,7 +64,7 @@ int llvm_test_dibuilder(void) {
"/test/include/llvm-c-test-import.h", 34,
"", 0);
LLVMMetadataRef ImportedModule = LLVMDIBuilderCreateImportedModuleFromModule(
- DIB, Module, OtherModule, File, 42, NULL, 0);
+ DIB, Module, OtherModule, FileCS, 42, NULL, 0);
LLVMDIBuilderCreateImportedModuleFromAlias(DIB, Module, ImportedModule, File,
42, NULL, 0);
diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
index de83a0d..4c08b57 100644
--- a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
+++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp
@@ -386,7 +386,9 @@ static llvm::Error handleObjectFile(ObjectFile &Obj, const std::string &OutFile,
// Make a DWARF transformer object and populate the ranges of the code
// so we don't end up adding invalid functions to GSYM data.
- DwarfTransformer DT(*DICtx, Gsym, LoadDwarfCallSites);
+ bool IsMachO = dyn_cast<object::MachOObjectFile>(&Obj) != nullptr;
+
+ DwarfTransformer DT(*DICtx, Gsym, LoadDwarfCallSites, IsMachO);
if (!TextRanges.empty())
Gsym.SetValidTextRanges(TextRanges);
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 731d648..79216e8 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -40,6 +40,7 @@
#include "llvm/ExecutionEngine/Orc/SectCreate.h"
#include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h"
@@ -312,10 +313,19 @@ static cl::opt<bool>
cl::desc("Show FailedToMaterialize errors"),
cl::init(false), cl::cat(JITLinkCategory));
-static cl::opt<bool> UseSharedMemory(
- "use-shared-memory",
- cl::desc("Use shared memory to transfer generated code and data"),
- cl::init(false), cl::cat(JITLinkCategory));
+enum class MemMgr { Default, Generic, SimpleRemote, Shared };
+
+static cl::opt<MemMgr> UseMemMgr(
+ "use-memmgr", cl::desc("Choose memory manager"), cl::init(MemMgr::Generic),
+ cl::values(clEnumValN(MemMgr::Default, "default",
+ "Use setup default (InProcess or EPCGeneric)"),
+ clEnumValN(MemMgr::Generic, "generic",
+ "Generic remote memory manager"),
+ clEnumValN(MemMgr::SimpleRemote, "simple-remote",
+ "Mapper memory manager with simple-remote backend"),
+ clEnumValN(MemMgr::Shared, "shared",
+ "Mapper memory manager with shared-memory manager")),
+ cl::cat(JITLinkCategory));
static cl::opt<std::string>
OverrideTriple("triple", cl::desc("Override target triple detection"),
@@ -623,8 +633,9 @@ public:
});
}
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override {
- return InProcessMemoryMapper::prepare(Addr - DeltaAddr, ContentSize);
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override {
+ return InProcessMemoryMapper::prepare(G, Addr - DeltaAddr, ContentSize);
}
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override {
@@ -717,6 +728,27 @@ static std::unique_ptr<JITLinkMemoryManager> createInProcessMemoryManager() {
}
Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
+createSimpleRemoteMemoryManager(SimpleRemoteEPC &SREPC) {
+ SimpleRemoteMemoryMapper::SymbolAddrs SAs;
+ if (auto Err = SREPC.getBootstrapSymbols(
+ {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
+ {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
+ {SAs.Initialize,
+ rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+ {SAs.Deinitialize,
+ rt::SimpleExecutorMemoryManagerDeinitializeWrapperName},
+ {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
+ return std::move(Err);
+#ifdef _WIN32
+ size_t SlabSize = 1024 * 1024;
+#else
+ size_t SlabSize = 1024 * 1024 * 1024;
+#endif
+ return MapperJITLinkMemoryManager::CreateWithMapper<SimpleRemoteMemoryMapper>(
+ SlabSize, SREPC, SAs);
+}
+
+Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
SharedMemoryMapper::SymbolAddrs SAs;
if (auto Err = SREPC.getBootstrapSymbols(
@@ -744,6 +776,19 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
SlabSize, SREPC, SAs);
}
+static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) {
+ switch (UseMemMgr) {
+ case MemMgr::Default:
+ case MemMgr::Generic:
+ break;
+ case MemMgr::SimpleRemote:
+ S.CreateMemoryManager = createSimpleRemoteMemoryManager;
+ break;
+ case MemMgr::Shared:
+ S.CreateMemoryManager = createSharedMemoryManager;
+ break;
+ }
+}
static Expected<MaterializationUnit::Interface>
getTestObjectFileInterface(Session &S, MemoryBufferRef O) {
@@ -903,8 +948,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> launchExecutor() {
close(FromExecutor[WriteEnd]);
auto S = SimpleRemoteEPC::Setup();
- if (UseSharedMemory)
- S.CreateMemoryManager = createSharedMemoryManager;
+ setupEPCRemoteMemoryManager(S);
return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
std::make_unique<DynamicThreadPoolTaskDispatcher>(MaterializationThreads),
@@ -993,8 +1037,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> connectToExecutor() {
return SockFD.takeError();
auto S = SimpleRemoteEPC::Setup();
- if (UseSharedMemory)
- S.CreateMemoryManager = createSharedMemoryManager;
+ setupEPCRemoteMemoryManager(S);
return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
std::make_unique<DynamicThreadPoolTaskDispatcher>(std::nullopt),
diff --git a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
index fa56d0d..fb5c0bf 100644
--- a/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
+++ b/llvm/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
@@ -31,6 +31,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/TargetParser/SubtargetFeature.h"
@@ -142,6 +143,7 @@ int AssembleOneInput(const uint8_t *Data, size_t Size) {
static const std::vector<std::string> NoIncludeDirs;
SrcMgr.setIncludeDirs(NoIncludeDirs);
+ SrcMgr.setVirtualFileSystem(vfs::getRealFileSystem());
static std::string ArchName;
std::string Error;
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index 2a89961..3b2d4f8 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -40,6 +40,7 @@
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/WithColor.h"
#include "llvm/TargetParser/Host.h"
#include <memory>
@@ -439,6 +440,7 @@ int main(int argc, char **argv) {
// Record the location of the include directories so that the lexer can find
// it later.
SrcMgr.setIncludeDirs(IncludeDirs);
+ SrcMgr.setVirtualFileSystem(vfs::getRealFileSystem());
std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TheTriple));
assert(MRI && "Unable to create target register info!");
diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
index cda86e7..7b88576 100644
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/WithColor.h"
#include "llvm/TargetParser/Host.h"
#include <ctime>
@@ -313,6 +314,7 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
}
}
SrcMgr.setIncludeDirs(IncludeDirs);
+ SrcMgr.setVirtualFileSystem(vfs::getRealFileSystem());
std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TheTriple));
assert(MRI && "Unable to create target register info!");
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 46be539d..3ec644a 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -728,11 +728,17 @@ public:
} while (!Comments.empty());
FOS.flush();
}
+
+ // Hook invoked when starting to disassemble a symbol at the current position.
+ // Default is no-op.
+ virtual void onSymbolStart() {}
};
PrettyPrinter PrettyPrinterInst;
class HexagonPrettyPrinter : public PrettyPrinter {
public:
+ void onSymbolStart() override { reset(); }
+
void printLead(ArrayRef<uint8_t> Bytes, uint64_t Address,
formatted_raw_ostream &OS) {
if (LeadingAddr)
@@ -2228,6 +2234,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
Start += Size;
break;
}
+ // Allow targets to reset any per-symbol state.
+ DT->Printer->onSymbolStart();
formatted_raw_ostream FOS(OS);
Index = Start;
if (SectionAddr < StartAddress)
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index ab93316..9c9b2dd 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -8155,6 +8155,8 @@ void LLVMELFDumper<ELFT>::printBBAddrMaps(bool PrettyPGOAnalysis) {
W.printHex("Offset", BBE.Offset);
if (!BBE.CallsiteEndOffsets.empty())
W.printList("Callsite End Offsets", BBE.CallsiteEndOffsets);
+ if (PAM.FeatEnable.BBHash)
+ W.printHex("Hash", BBE.Hash);
W.printHex("Size", BBE.Size);
W.printBoolean("HasReturn", BBE.hasReturn());
W.printBoolean("HasTailCall", BBE.hasTailCall());
diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp
index ef4552f..68e18f6 100644
--- a/llvm/tools/obj2yaml/elf2yaml.cpp
+++ b/llvm/tools/obj2yaml/elf2yaml.cpp
@@ -900,7 +900,7 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
while (Cur && Cur.tell() < Content.size()) {
if (Shdr->sh_type == ELF::SHT_LLVM_BB_ADDR_MAP) {
Version = Data.getU8(Cur);
- if (Cur && Version > 3)
+ if (Cur && Version > 4)
return createStringError(
errc::invalid_argument,
"invalid SHT_LLVM_BB_ADDR_MAP section version: " +
@@ -946,8 +946,11 @@ ELFDumper<ELFT>::dumpBBAddrMapSection(const Elf_Shdr *Shdr) {
}
uint64_t Size = Data.getULEB128(Cur);
uint64_t Metadata = Data.getULEB128(Cur);
+ std::optional<llvm::yaml::Hex64> Hash;
+ if (FeatureOrErr->BBHash)
+ Hash = Data.getU64(Cur);
BBEntries.push_back(
- {ID, Offset, Size, Metadata, std::move(CallsiteEndOffsets)});
+ {ID, Offset, Size, Metadata, std::move(CallsiteEndOffsets), Hash});
}
TotalNumBlocks += BBEntries.size();
BBRanges.push_back({BaseAddress, /*NumBlocks=*/{}, BBEntries});
diff --git a/llvm/unittests/ADT/BitFieldsTest.cpp b/llvm/unittests/ADT/BitFieldsTest.cpp
index 3062d5d..ae541fe 100644
--- a/llvm/unittests/ADT/BitFieldsTest.cpp
+++ b/llvm/unittests/ADT/BitFieldsTest.cpp
@@ -247,8 +247,8 @@ TEST(BitfieldsTest, ValueTooBigBounded) {
Bitfield::set<A>(Storage, 0);
Bitfield::set<A>(Storage, -1);
Bitfield::set<A>(Storage, -2);
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is too big");
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is too small");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is out of range");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is out of range");
}
#endif
diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp
index 137dd43..e2e778f 100644
--- a/llvm/unittests/ADT/SmallVectorTest.cpp
+++ b/llvm/unittests/ADT/SmallVectorTest.cpp
@@ -127,24 +127,24 @@ public:
return c0.getValue() == c1.getValue();
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator!=(const Constructable &c0,
- const Constructable &c1) {
+ [[maybe_unused]] friend bool operator!=(const Constructable &c0,
+ const Constructable &c1) {
return c0.getValue() != c1.getValue();
}
friend bool operator<(const Constructable &c0, const Constructable &c1) {
return c0.getValue() < c1.getValue();
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator<=(const Constructable &c0,
- const Constructable &c1) {
+ [[maybe_unused]] friend bool operator<=(const Constructable &c0,
+ const Constructable &c1) {
return c0.getValue() <= c1.getValue();
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator>(const Constructable &c0,
- const Constructable &c1) {
+ [[maybe_unused]] friend bool operator>(const Constructable &c0,
+ const Constructable &c1) {
return c0.getValue() > c1.getValue();
}
- friend bool LLVM_ATTRIBUTE_UNUSED operator>=(const Constructable &c0,
- const Constructable &c1) {
+ [[maybe_unused]] friend bool operator>=(const Constructable &c0,
+ const Constructable &c1) {
return c0.getValue() >= c1.getValue();
}
};
diff --git a/llvm/unittests/ADT/StringExtrasTest.cpp b/llvm/unittests/ADT/StringExtrasTest.cpp
index fbaed38..af88f889 100644
--- a/llvm/unittests/ADT/StringExtrasTest.cpp
+++ b/llvm/unittests/ADT/StringExtrasTest.cpp
@@ -290,6 +290,12 @@ TEST(StringExtrasTest, ListSeparator) {
EXPECT_EQ(S, "");
S = LS2;
EXPECT_EQ(S, " ");
+
+ ListSeparator LS3(",", "{");
+ S = LS3;
+ EXPECT_EQ(S, "{");
+ S = LS3;
+ EXPECT_EQ(S, ",");
}
TEST(StringExtrasTest, toStringAPInt) {
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index bcb1521..0fbf371 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -153,13 +153,14 @@ TEST(StringSwitchTest, EndsWithLower) {
}
TEST(StringSwitchTest, Cases) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.Cases("linux", "unix", "*nix", "posix", OSType::Linux)
+ .Cases({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -172,21 +173,26 @@ TEST(StringSwitchTest, Cases) {
EXPECT_EQ(OSType::Linux, Translate("*nix"));
EXPECT_EQ(OSType::Linux, Translate("posix"));
+ EXPECT_EQ(OSType::MacOS, Translate("macos"));
+ EXPECT_EQ(OSType::MacOS, Translate("osx"));
+
// Note that the whole null-terminator embedded string is required for the
// case to match.
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate("Windows"));
+ EXPECT_EQ(OSType::Unknown, Translate("MacOS"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
TEST(StringSwitchTest, CasesLower) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.CasesLower("linux", "unix", "*nix", "posix", OSType::Linux)
+ .CasesLower({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -202,6 +208,9 @@ TEST(StringSwitchTest, CasesLower) {
EXPECT_EQ(OSType::Windows, Translate(llvm::StringRef("wind\0ws", 7)));
EXPECT_EQ(OSType::Linux, Translate("linux"));
+ EXPECT_EQ(OSType::MacOS, Translate("macOS"));
+ EXPECT_EQ(OSType::MacOS, Translate("OSX"));
+
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
diff --git a/llvm/unittests/ADT/TypeTraitsTest.cpp b/llvm/unittests/ADT/TypeTraitsTest.cpp
index a56aa7e..f9b8d6d 100644
--- a/llvm/unittests/ADT/TypeTraitsTest.cpp
+++ b/llvm/unittests/ADT/TypeTraitsTest.cpp
@@ -40,9 +40,7 @@ struct Foo {
struct CheckMethodPointer : CheckFunctionTraits<decltype(&Foo::func)> {};
/// Test lambda references.
-LLVM_ATTRIBUTE_UNUSED auto lambdaFunc = [](const int &v) -> bool {
- return true;
-};
+[[maybe_unused]] auto lambdaFunc = [](const int &v) -> bool { return true; };
struct CheckLambda : CheckFunctionTraits<decltype(lambdaFunc)> {};
} // end anonymous namespace
diff --git a/llvm/unittests/Analysis/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
index ee37fad..8c3a213 100644
--- a/llvm/unittests/Analysis/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -369,10 +369,8 @@ TEST(DXILResource, AnnotationsAndMetadata) {
{
StructType *CBufStruct =
StructType::create(Context, {Floatx4Ty, Floatx4Ty}, "cb0");
- TargetExtType *CBufLayoutType =
- llvm::TargetExtType::get(Context, "dx.Layout", CBufStruct, {32, 0, 16});
ResourceTypeInfo RTI(
- llvm::TargetExtType::get(Context, "dx.CBuffer", CBufLayoutType));
+ llvm::TargetExtType::get(Context, "dx.CBuffer", CBufStruct));
EXPECT_EQ(RTI.getResourceClass(), ResourceClass::CBuffer);
EXPECT_EQ(RTI.getCBufferSize(DL), 32u);
EXPECT_EQ(RTI.getResourceKind(), ResourceKind::CBuffer);
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index 1a68823..5d7eded 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -11,6 +11,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Constants.h"
@@ -26,6 +27,8 @@
namespace llvm {
+using namespace SCEVPatternMatch;
+
// We use this fixture to ensure that we clean up ScalarEvolution before
// deleting the PassManager.
class ScalarEvolutionsTest : public testing::Test {
@@ -64,11 +67,6 @@ static std::optional<APInt> computeConstantDifference(ScalarEvolution &SE,
return SE.computeConstantDifference(LHS, RHS);
}
- static bool matchURem(ScalarEvolution &SE, const SCEV *Expr, const SCEV *&LHS,
- const SCEV *&RHS) {
- return SE.matchURem(Expr, LHS, RHS);
- }
-
static bool isImpliedCond(
ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS,
@@ -1524,7 +1522,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) {
auto *URemI = getInstructionByName(F, N);
auto *S = SE.getSCEV(URemI);
const SCEV *LHS, *RHS;
- EXPECT_TRUE(matchURem(SE, S, LHS, RHS));
+ EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE)));
EXPECT_EQ(LHS, SE.getSCEV(URemI->getOperand(0)));
EXPECT_EQ(RHS, SE.getSCEV(URemI->getOperand(1)));
EXPECT_EQ(LHS->getType(), S->getType());
@@ -1537,7 +1535,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) {
auto *URem1 = getInstructionByName(F, "rem4");
auto *S = SE.getSCEV(Ext);
const SCEV *LHS, *RHS;
- EXPECT_TRUE(matchURem(SE, S, LHS, RHS));
+ EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE)));
EXPECT_NE(LHS, SE.getSCEV(URem1->getOperand(0)));
// RHS and URem1->getOperand(1) have different widths, so compare the
// integer values.
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index 3ab2caf..57e15a4 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -39,20 +39,21 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"64-i128:128-n32:64-S128-Fn32");
// Check that AMDGPU targets add -G1 if it's not present.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "m:e-e-p:32:32-G1");
// and that ANDGCN adds p7 and p8 as well.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
- "e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
// Check that the old AMDGCN p8:128:128 definition is upgraded
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
- "e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-"
- "p9:192:256:256:32");
+ "m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
+ "192:256:256:32");
// but that r600 does not.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"), "e-p:32:32-G1");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
+ "m:e-e-p:32:32-G1");
// Ensure that the non-integral direction for address space 8 doesn't get
// added in to pointer declarations.
@@ -62,11 +63,10 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7",
"amdgcn"),
- "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-"
- "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
+ "m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
+ "64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
"1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
- "128:48-"
- "p9:192:256:256:32");
+ "128:48-p9:192:256:256:32");
// Check that RISCV64 upgrades -n64 to -n32:64.
EXPECT_EQ(UpgradeDataLayoutString("e-m:e-p:64:64-i64:64-i128:128-n64-S128",
@@ -147,28 +147,29 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
"64-S128-Fn32");
// Check that AMDGPU targets don't add -G1 if there is already a -G flag.
- EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
- EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "G2");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"),
+ "m:e-e-p:32:32-G2");
+ EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
- "e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
- "G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
- "e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:"
- "256:256:32");
+ "m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
+ "192:256:256:32");
// Check that AMDGCN targets don't add already declared address space 7.
EXPECT_EQ(
UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
EXPECT_EQ(
UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
- "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
EXPECT_EQ(
UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
- "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
// Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
// flag.
@@ -198,10 +199,10 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
EXPECT_EQ(DL2, "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128");
// Check that AMDGPU targets add G1 if it's not present.
- EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1");
+ EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
EXPECT_EQ(
UpgradeDataLayoutString("", "amdgcn"),
- "G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+ "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
// Check that SPIR & SPIRV targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
index 7340f56..04cd66c 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp
@@ -420,12 +420,14 @@ TEST(LegalizerInfoTest, RuleSets) {
// Raw type form
LI.getActionDefinitionsBuilder(G_ADD)
- .fewerElementsIf(typeIs(0, v4s32), changeElementCountTo(0, v2s32))
- .fewerElementsIf(typeIs(0, v8s32), changeElementCountTo(0, s32))
- .fewerElementsIf(typeIs(0, LLT::scalable_vector(4, 16)),
- changeElementCountTo(0, LLT::scalable_vector(2, 16)))
- .fewerElementsIf(typeIs(0, LLT::scalable_vector(8, 16)),
- changeElementCountTo(0, s16));
+ .fewerElementsIf(typeIs(0, v4s32),
+ changeElementCountTo(0, ElementCount::getFixed(2)))
+ .fewerElementsIf(typeIs(0, v8s32),
+ changeElementCountTo(0, ElementCount::getFixed(1)))
+ .fewerElementsIf(typeIs(0, LLT::scalable_vector(4, s16)),
+ changeElementCountTo(0, ElementCount::getScalable(2)))
+ .fewerElementsIf(typeIs(0, LLT::scalable_vector(8, s16)),
+ changeElementCountTo(0, ElementCount::getFixed(1)));
LegacyInfo.computeTables();
diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
index 33f53de..d560073 100644
--- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
+++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
@@ -4899,3 +4899,189 @@ TEST(GSYMTest, TestLookupsOfOverlappingAndUnequalRanges) {
for (const auto &Line : ExpectedDumpLines)
EXPECT_TRUE(DumpStr.find(Line) != std::string::npos);
}
+
+TEST(GSYMTest, TestUnableToLocateDWO) {
+  // Test that llvm-gsymutil will not produce "unable to locate DWO file" for
+ // Apple binaries. Apple uses DW_AT_GNU_dwo_id for non split DWARF purposes
+ // and this makes llvm-gsymutil create warnings and errors.
+ //
+ // 0x0000000b: DW_TAG_compile_unit
+ // DW_AT_name ("main.cpp")
+ // DW_AT_language (DW_LANG_C)
+ // DW_AT_GNU_dwo_id (0xfffffffe)
+ StringRef yamldata = R"(
+ debug_str:
+ - ''
+ - main.cpp
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Attribute: DW_AT_GNU_dwo_id
+ Form: DW_FORM_data4
+ debug_info:
+ - Length: 0x11
+ Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x1
+ - Value: 0x2
+ - Value: 0xFFFFFFFE
+ )";
+ auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata);
+ ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded());
+ std::unique_ptr<DWARFContext> DwarfContext =
+ DWARFContext::create(*ErrOrSections, 8);
+ ASSERT_TRUE(DwarfContext.get() != nullptr);
+ std::string errors;
+ raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
+ GsymCreator GC;
+ // Make a DWARF transformer that is MachO (Apple) to avoid warnings about
+ // not finding DWO files.
+ DwarfTransformer DT(*DwarfContext, GC, /*LDCS=*/false, /*MachO*/ true);
+ const uint32_t ThreadCount = 1;
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
+
+  // Make sure this warning does not appear in the captured output.
+ std::string warn("warning: Unable to retrieve DWO .debug_info section for");
+ EXPECT_TRUE(errors.find(warn) == std::string::npos);
+}
+
+TEST(GSYMTest, TestDWARFTransformNoErrorForMissingFileDecl) {
+ // Test that if llvm-gsymutil finds a line table for a compile unit and if
+ // there are no matching entries for a function in that compile unit, that
+ // it doesn't print out a error saying that a DIE has an invalid file index
+ // if there is no DW_AT_decl_file attribute.
+ //
+ // 0x0000000b: DW_TAG_compile_unit
+ // DW_AT_name ("main.cpp")
+ // DW_AT_language (DW_LANG_C)
+ // DW_AT_stmt_list (0x00000000)
+ //
+ // 0x00000015: DW_TAG_subprogram
+ // DW_AT_name ("foo")
+ // DW_AT_low_pc (0x0000000000001000)
+ // DW_AT_high_pc (0x0000000000001050)
+ //
+ // 0x0000002a: NULL
+ //
+ // Line table that has entries, but none that match "foo":
+ //
+ // Address Line Column File ISA Discriminator OpIndex Flags
+ // ------------------ ------ ------ ------ --- ------------- ------- -----
+ // 0x0000000000002000 10 0 1 0 0 0 is_stmt
+ // 0x0000000000002050 13 0 1 0 0 0 is_stmt
+
+ StringRef yamldata = R"(
+ debug_str:
+ - ''
+ - main.cpp
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Attribute: DW_AT_stmt_list
+ Form: DW_FORM_sec_offset
+ - Code: 0x2
+ Tag: DW_TAG_subprogram
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_string
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_addr
+ debug_info:
+ - Length: 0x27
+ Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x1
+ - Value: 0x2
+ - Value: 0x0
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0xDEADBEEFDEADBEEF
+ CStr: foo
+ - Value: 0x1000
+ - Value: 0x1050
+ - AbbrCode: 0x0
+ debug_line:
+ - Length: 58
+ Version: 2
+ PrologueLength: 31
+ MinInstLength: 1
+ DefaultIsStmt: 1
+ LineBase: 251
+ LineRange: 14
+ OpcodeBase: 13
+ StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ]
+ Files:
+ - Name: main.cpp
+ DirIdx: 0
+ ModTime: 0
+ Length: 0
+ Opcodes:
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 9
+ SubOpcode: DW_LNE_set_address
+ Data: 8192
+ - Opcode: DW_LNS_advance_line
+ SData: 9
+ Data: 0
+ - Opcode: DW_LNS_copy
+ Data: 0
+ - Opcode: DW_LNS_advance_pc
+ Data: 80
+ - Opcode: DW_LNS_advance_line
+ SData: 3
+ Data: 0
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 1
+ SubOpcode: DW_LNE_end_sequence
+ Data: 0
+ )";
+ auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata);
+ ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded());
+ std::unique_ptr<DWARFContext> DwarfContext =
+ DWARFContext::create(*ErrOrSections, 8);
+ ASSERT_TRUE(DwarfContext.get() != nullptr);
+ std::string errors;
+ raw_string_ostream OS(errors);
+ OutputAggregator OSAgg(&OS);
+ GsymCreator GC;
+ DwarfTransformer DT(*DwarfContext, GC);
+ const uint32_t ThreadCount = 1;
+ ASSERT_THAT_ERROR(DT.convert(ThreadCount, OSAgg), Succeeded());
+ ASSERT_THAT_ERROR(GC.finalize(OSAgg), Succeeded());
+
+  // Make sure this error does not appear in the captured output.
+ std::string error_str("error: function DIE at 0x00000015 has an invalid file "
+ "index 4294967295 in its DW_AT_decl_file attribute");
+ EXPECT_TRUE(errors.find(error_str) == std::string::npos);
+}
diff --git a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
index d4b45ea..2c6650d 100644
--- a/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManagerTest.cpp
@@ -39,8 +39,11 @@ public:
return ExecutorAddr::fromPtr(MB.base());
}
- Error finalize(tpctypes::FinalizeRequest FR) {
+ Expected<ExecutorAddr> initialize(tpctypes::FinalizeRequest FR) {
+ assert(!FR.Segments.empty());
+ ExecutorAddr Base = FR.Segments[0].Addr;
for (auto &Seg : FR.Segments) {
+ Base = std::min(Base, Seg.Addr);
char *Mem = Seg.Addr.toPtr<char *>();
memcpy(Mem, Seg.Content.data(), Seg.Content.size());
memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
@@ -52,10 +55,10 @@ public:
if ((Seg.RAG.Prot & MemProt::Exec) != MemProt::Exec)
sys::Memory::InvalidateInstructionCache(Mem, Seg.Size);
}
- return Error::success();
+ return Base;
}
- Error deallocate(std::vector<ExecutorAddr> &Bases) {
+ Error release(std::vector<ExecutorAddr> &Bases) {
Error Err = Error::success();
for (auto &Base : Bases) {
auto I = Blocks.find(Base.toPtr<void *>());
@@ -86,18 +89,18 @@ CWrapperFunctionResult testReserve(const char *ArgData, size_t ArgSize) {
.release();
}
-CWrapperFunctionResult testFinalize(const char *ArgData, size_t ArgSize) {
- return WrapperFunction<rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>::
+CWrapperFunctionResult testInitialize(const char *ArgData, size_t ArgSize) {
+ return WrapperFunction<
+ rt::SPSSimpleExecutorMemoryManagerInitializeSignature>::
handle(ArgData, ArgSize,
- makeMethodWrapperHandler(&SimpleAllocator::finalize))
+ makeMethodWrapperHandler(&SimpleAllocator::initialize))
.release();
}
-CWrapperFunctionResult testDeallocate(const char *ArgData, size_t ArgSize) {
- return WrapperFunction<
- rt::SPSSimpleExecutorMemoryManagerDeallocateSignature>::
+CWrapperFunctionResult testRelease(const char *ArgData, size_t ArgSize) {
+ return WrapperFunction<rt::SPSSimpleExecutorMemoryManagerReleaseSignature>::
handle(ArgData, ArgSize,
- makeMethodWrapperHandler(&SimpleAllocator::deallocate))
+ makeMethodWrapperHandler(&SimpleAllocator::release))
.release();
}
@@ -108,8 +111,8 @@ TEST(EPCGenericJITLinkMemoryManagerTest, AllocFinalizeFree) {
EPCGenericJITLinkMemoryManager::SymbolAddrs SAs;
SAs.Allocator = ExecutorAddr::fromPtr(&SA);
SAs.Reserve = ExecutorAddr::fromPtr(&testReserve);
- SAs.Finalize = ExecutorAddr::fromPtr(&testFinalize);
- SAs.Deallocate = ExecutorAddr::fromPtr(&testDeallocate);
+ SAs.Initialize = ExecutorAddr::fromPtr(&testInitialize);
+ SAs.Release = ExecutorAddr::fromPtr(&testRelease);
auto MemMgr = std::make_unique<EPCGenericJITLinkMemoryManager>(*SelfEPC, SAs);
StringRef Hello = "hello";
diff --git a/llvm/unittests/ExecutionEngine/Orc/ExecutorAddressTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ExecutorAddressTest.cpp
index ae9db14..9a37980 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ExecutorAddressTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ExecutorAddressTest.cpp
@@ -97,10 +97,16 @@ TEST(ExecutorAddrTest, AddrRanges) {
EXPECT_FALSE(R1.contains(A0));
EXPECT_FALSE(R1.contains(A2));
+ EXPECT_TRUE(R3.contains(R0)); // True for singleton range at start.
+ EXPECT_TRUE(R3.contains(R1)); // True for singleton range at end.
+  EXPECT_FALSE(R3.contains(R2)); // False for non-overlapping singleton range.
+ EXPECT_FALSE(R3.contains(R4)); // False for overlapping, uncontained range.
+
EXPECT_FALSE(R1.overlaps(R0));
EXPECT_FALSE(R1.overlaps(R2));
EXPECT_TRUE(R1.overlaps(R3));
EXPECT_TRUE(R1.overlaps(R4));
+ EXPECT_TRUE(R3.overlaps(R4));
EXPECT_LE(R0, R0);
EXPECT_LT(R0, R1);
diff --git a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
index c5e9d43..a5269f7 100644
--- a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
@@ -39,8 +39,8 @@ public:
return Mapper->initialize(AI, std::move(OnInitialized));
}
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override {
- return Mapper->prepare(Addr, ContentSize);
+ char *prepare(LinkGraph &G, ExecutorAddr Addr, size_t ContentSize) override {
+ return Mapper->prepare(G, Addr, ContentSize);
}
void deinitialize(ArrayRef<ExecutorAddr> Allocations,
diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
index fea9eab..1174493 100644
--- a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/Support/Process.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
@@ -66,6 +67,9 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
{
std::unique_ptr<MemoryMapper> Mapper =
cantFail(InProcessMemoryMapper::Create());
+ jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(),
+ Triple("x86_64-apple-darwin"), SubtargetFeatures(),
+ jitlink::getGenericEdgeKindName);
// We will do two separate allocations
auto PageSize = Mapper->getPageSize();
@@ -80,7 +84,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
{
// Provide working memory
- char *WA1 = Mapper->prepare(Mem1->Start, HW.size() + 1);
+ char *WA1 = Mapper->prepare(G, Mem1->Start, HW.size() + 1);
std::strcpy(WA1, HW.c_str());
}
@@ -105,7 +109,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
}
{
- char *WA2 = Mapper->prepare(Mem1->Start + PageSize, HW.size() + 1);
+ char *WA2 = Mapper->prepare(G, Mem1->Start + PageSize, HW.size() + 1);
std::strcpy(WA2, HW.c_str());
}
@@ -158,7 +162,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
auto Mem2 = reserve(*Mapper, PageSize);
EXPECT_THAT_ERROR(Mem2.takeError(), Succeeded());
- char *WA = Mapper->prepare(Mem2->Start, HW.size() + 1);
+ char *WA = Mapper->prepare(G, Mem2->Start, HW.size() + 1);
std::strcpy(WA, HW.c_str());
MemoryMapper::AllocInfo Alloc3;
diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
index cd10ffe..686d85d 100644
--- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp
@@ -9,6 +9,7 @@
#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
#include "llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
#include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
@@ -42,7 +43,7 @@ protected:
// COFF-ARM64 is not supported yet
auto Triple = JTMB->getTargetTriple();
- if (Triple.isOSBinFormatCOFF() && Triple.isAArch64())
+ if (Triple.isOSBinFormatCOFF())
GTEST_SKIP();
// SystemZ is not supported yet.
@@ -84,8 +85,11 @@ protected:
ES = std::make_unique<ExecutionSession>(std::move(*EPC));
JD = &ES->createBareJITDylib("main");
+
ObjLinkingLayer = std::make_unique<ObjectLinkingLayer>(
- *ES, std::make_unique<InProcessMemoryManager>(*PageSize));
+ *ES, std::make_unique<MapperJITLinkMemoryManager>(
+ 10 * 1024 * 1024,
+ std::make_unique<InProcessMemoryMapper>(*PageSize)));
DL = std::make_unique<DataLayout>(std::move(*DLOrErr));
auto TM = JTMB->createTargetMachine();
diff --git a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
index 700500f..7775f3c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
@@ -8,6 +8,7 @@
#include "OrcTestCommon.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
#include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
@@ -67,12 +68,16 @@ TEST(SharedMemoryMapperTest, MemReserveInitializeDeinitializeRelease) {
auto PageSize = Mapper->getPageSize();
size_t ReqSize = PageSize;
+ jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(),
+ Triple("x86_64-apple-darwin"), SubtargetFeatures(),
+ jitlink::getGenericEdgeKindName);
Mapper->reserve(ReqSize, [&](Expected<ExecutorAddrRange> Result) {
EXPECT_THAT_ERROR(Result.takeError(), Succeeded());
auto Reservation = std::move(*Result);
{
- char *Addr = Mapper->prepare(Reservation.Start, TestString.size() + 1);
+ char *Addr =
+ Mapper->prepare(G, Reservation.Start, TestString.size() + 1);
std::strcpy(Addr, TestString.c_str());
}
MemoryMapper::AllocInfo AI;
diff --git a/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
index 6e9b0b2..9c6f19c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/SimpleExecutorMemoryManagerTest.cpp
@@ -34,12 +34,12 @@ TEST(SimpleExecutorMemoryManagerTest, AllocFinalizeFree) {
SimpleExecutorMemoryManager MemMgr;
constexpr unsigned AllocSize = 16384;
- auto Mem = MemMgr.allocate(AllocSize);
+ auto Mem = MemMgr.reserve(AllocSize);
EXPECT_THAT_ERROR(Mem.takeError(), Succeeded());
std::string HW = "Hello, world!";
- int FinalizeCounter = 0;
+ int InitializeCounter = 0;
int DeallocateCounter = 0;
tpctypes::FinalizeRequest FR;
@@ -52,27 +52,27 @@ TEST(SimpleExecutorMemoryManagerTest, AllocFinalizeFree) {
{/* Finalize: */
cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddr>>(
ExecutorAddr::fromPtr(incrementWrapper),
- ExecutorAddr::fromPtr(&FinalizeCounter))),
+ ExecutorAddr::fromPtr(&InitializeCounter))),
/* Deallocate: */
cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddr>>(
ExecutorAddr::fromPtr(incrementWrapper),
ExecutorAddr::fromPtr(&DeallocateCounter)))});
- EXPECT_EQ(FinalizeCounter, 0);
+ EXPECT_EQ(InitializeCounter, 0);
EXPECT_EQ(DeallocateCounter, 0);
- auto FinalizeErr = MemMgr.finalize(FR);
- EXPECT_THAT_ERROR(std::move(FinalizeErr), Succeeded());
+ auto InitializeErr = MemMgr.initialize(FR);
+ EXPECT_THAT_EXPECTED(std::move(InitializeErr), Succeeded());
- EXPECT_EQ(FinalizeCounter, 1);
+ EXPECT_EQ(InitializeCounter, 1);
EXPECT_EQ(DeallocateCounter, 0);
EXPECT_EQ(HW, std::string(Mem->toPtr<const char *>()));
- auto DeallocateErr = MemMgr.deallocate({*Mem});
- EXPECT_THAT_ERROR(std::move(DeallocateErr), Succeeded());
+ auto ReleaseErr = MemMgr.release({*Mem});
+ EXPECT_THAT_ERROR(std::move(ReleaseErr), Succeeded());
- EXPECT_EQ(FinalizeCounter, 1);
+ EXPECT_EQ(InitializeCounter, 1);
EXPECT_EQ(DeallocateCounter, 1);
}
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
index 95c26b1..a8706ce 100644
--- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -431,8 +431,8 @@ TEST_F(OpenMPDecompositionTest, Firstprivate3) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (12), (27)
- ASSERT_EQ(Dir1, "teams shared(x)"); // (6), (17)
+ ASSERT_EQ(Dir0, "target map(2, , , , , , (x))"); // (12), (27)
+ ASSERT_EQ(Dir1, "teams shared(x)"); // (6), (17)
ASSERT_EQ(Dir2, "distribute firstprivate(x) lastprivate(, (x))"); // (5), (21)
}
@@ -574,9 +574,9 @@ TEST_F(OpenMPDecompositionTest, Lastprivate3) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (21), (27)
- ASSERT_EQ(Dir1, "parallel shared(x)"); // (22)
- ASSERT_EQ(Dir2, "do lastprivate(, (x))"); // (21)
+ ASSERT_EQ(Dir0, "target map(2, , , , , , (x))"); // (21), (27)
+ ASSERT_EQ(Dir1, "parallel shared(x)"); // (22)
+ ASSERT_EQ(Dir2, "do lastprivate(, (x))"); // (21)
}
// SHARED
@@ -984,9 +984,9 @@ TEST_F(OpenMPDecompositionTest, Reduction7) {
std::string Dir0 = stringify(Dec.output[0]);
std::string Dir1 = stringify(Dec.output[1]);
std::string Dir2 = stringify(Dec.output[2]);
- ASSERT_EQ(Dir0, "target map(2, , , , , (x))"); // (36), (10)
- ASSERT_EQ(Dir1, "parallel shared(x)"); // (36), (1), (4)
- ASSERT_EQ(Dir2, "do reduction(, (3), (x))"); // (36)
+ ASSERT_EQ(Dir0, "target map(2, , , , , , (x))"); // (36), (10)
+ ASSERT_EQ(Dir1, "parallel shared(x)"); // (36), (1), (4)
+ ASSERT_EQ(Dir2, "do reduction(, (3), (x))"); // (36)
}
// IF
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index cf9b31c..67fee96 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "gtest/gtest.h"
@@ -1065,4 +1066,179 @@ TEST_F(ConstantFPRangeTest, sub) {
#endif
}
+TEST_F(ConstantFPRangeTest, mul) {
+ EXPECT_EQ(Full.mul(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.mul(Empty), Empty);
+ EXPECT_EQ(Empty.mul(Full), Empty);
+ EXPECT_EQ(Empty.mul(Empty), Empty);
+ EXPECT_EQ(One.mul(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.mul(Some),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(9.0)));
+ EXPECT_EQ(SomePos.mul(SomeNeg),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(-0.0)));
+ EXPECT_EQ(PosInf.mul(PosInf), PosInf);
+ EXPECT_EQ(NegInf.mul(NegInf), PosInf);
+ EXPECT_EQ(PosInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(NegInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(PosInf.mul(NegInf), NegInf);
+ EXPECT_EQ(NegInf.mul(PosInf), NegInf);
+ EXPECT_EQ(PosZero.mul(NegZero), NegZero);
+ EXPECT_EQ(PosZero.mul(Zero), Zero);
+ EXPECT_EQ(NegZero.mul(NegZero), PosZero);
+ EXPECT_EQ(NegZero.mul(Zero), Zero);
+ EXPECT_EQ(NaN.mul(NaN), QNaN);
+ EXPECT_EQ(NaN.mul(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.mul(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Prod = LHSC * RHSC;
+ EXPECT_TRUE(Res.contains(Prod))
+ << "Wrong result for " << LHS << " * " << RHS
+ << ". The result " << Res << " should contain " << Prod;
+ if (!Expected.contains(Prod))
+ Expected = Expected.unionWith(ConstantFPRange(Prod));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " * " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, div) {
+ EXPECT_EQ(Full.div(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.div(Empty), Empty);
+ EXPECT_EQ(Empty.div(Full), Empty);
+ EXPECT_EQ(Empty.div(Empty), Empty);
+ EXPECT_EQ(One.div(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.div(Some), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(SomePos.div(SomeNeg),
+ ConstantFPRange(APFloat::getInf(Sem, /*Negative=*/true),
+ APFloat::getZero(Sem, /*Negative=*/true),
+ /*MayBeQNaN=*/true, /*MayBeSNaN=*/false));
+ EXPECT_EQ(PosInf.div(PosInf), QNaN);
+ EXPECT_EQ(NegInf.div(NegInf), QNaN);
+ EXPECT_EQ(PosInf.div(Finite), NonNaN);
+ EXPECT_EQ(NegInf.div(Finite), NonNaN);
+ EXPECT_EQ(PosInf.div(NegInf), QNaN);
+ EXPECT_EQ(NegInf.div(PosInf), QNaN);
+ EXPECT_EQ(Zero.div(Zero), QNaN);
+ EXPECT_EQ(SomePos.div(PosInf), PosZero);
+ EXPECT_EQ(SomeNeg.div(PosInf), NegZero);
+ EXPECT_EQ(PosInf.div(SomePos), PosInf);
+ EXPECT_EQ(NegInf.div(SomeNeg), PosInf);
+ EXPECT_EQ(NegInf.div(Some), NonNaN);
+ EXPECT_EQ(NaN.div(NaN), QNaN);
+ EXPECT_EQ(NaN.div(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.div(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Val = LHSC / RHSC;
+ EXPECT_TRUE(Res.contains(Val))
+ << "Wrong result for " << LHS << " / " << RHS
+ << ". The result " << Res << " should contain " << Val;
+ if (!Expected.contains(Val))
+ Expected = Expected.unionWith(ConstantFPRange(Val));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " / " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, flushDenormals) {
+ const fltSemantics &FP8Sem = APFloat::Float8E4M3();
+ APFloat NormalVal = APFloat::getSmallestNormalized(FP8Sem);
+ APFloat Subnormal1 = NormalVal;
+ Subnormal1.next(/*nextDown=*/true);
+ APFloat Subnormal2 = APFloat::getSmallest(FP8Sem);
+ APFloat ZeroVal = APFloat::getZero(FP8Sem);
+ APFloat EdgeValues[8] = {-NormalVal, -Subnormal1, -Subnormal2, -ZeroVal,
+ ZeroVal, Subnormal2, Subnormal1, NormalVal};
+ constexpr DenormalMode::DenormalModeKind Modes[4] = {
+ DenormalMode::IEEE, DenormalMode::PreserveSign,
+ DenormalMode::PositiveZero, DenormalMode::Dynamic};
+ for (uint32_t I = 0; I != 8; ++I) {
+ for (uint32_t J = I; J != 8; ++J) {
+ ConstantFPRange OriginCR =
+ ConstantFPRange::getNonNaN(EdgeValues[I], EdgeValues[J]);
+ for (auto Mode : Modes) {
+ StringRef ModeName = denormalModeKindName(Mode);
+ ConstantFPRange FlushedCR = OriginCR;
+ FlushedCR.flushDenormals(Mode);
+
+ ConstantFPRange Expected = ConstantFPRange::getEmpty(FP8Sem);
+ auto CheckFlushedV = [&](const APFloat &V, const APFloat &FlushedV) {
+ EXPECT_TRUE(FlushedCR.contains(FlushedV))
+ << "Wrong result for flushDenormal(" << V << ", " << ModeName
+ << "). The result " << FlushedCR << " should contain "
+ << FlushedV;
+ if (!Expected.contains(FlushedV))
+ Expected = Expected.unionWith(ConstantFPRange(FlushedV));
+ };
+ EnumerateValuesInConstantFPRange(
+ OriginCR,
+ [&](const APFloat &V) {
+ if (V.isDenormal()) {
+ switch (Mode) {
+ case DenormalMode::IEEE:
+ break;
+ case DenormalMode::PreserveSign:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ break;
+ case DenormalMode::PositiveZero:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ case DenormalMode::Dynamic:
+ // PreserveSign
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ // PositiveZero
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ default:
+ llvm_unreachable("unknown denormal mode");
+ }
+ }
+ // It is not mandated that flushing to zero occurs.
+ CheckFlushedV(V, V);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(FlushedCR, Expected)
+ << "Suboptimal result for flushDenormal(" << OriginCR << ", "
+ << ModeName << "). Expected " << Expected << ", but got "
+ << FlushedCR;
+ }
+ }
+ }
+}
+
} // anonymous namespace
diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 54c7ddd..6376165 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -564,13 +564,17 @@ TEST(ConstantsTest, FoldGlobalVariablePtr) {
Global->setAlignment(Align(4));
- ConstantInt *TheConstant(ConstantInt::get(IntType, 2));
+ ConstantInt *TheConstant = ConstantInt::get(IntType, 2);
- Constant *TheConstantExpr(ConstantExpr::getPtrToInt(Global.get(), IntType));
+ Constant *PtrToInt = ConstantExpr::getPtrToInt(Global.get(), IntType);
+ ASSERT_TRUE(
+ ConstantFoldBinaryInstruction(Instruction::And, PtrToInt, TheConstant)
+ ->isNullValue());
- ASSERT_TRUE(ConstantFoldBinaryInstruction(Instruction::And, TheConstantExpr,
- TheConstant)
- ->isNullValue());
+ Constant *PtrToAddr = ConstantExpr::getPtrToAddr(Global.get(), IntType);
+ ASSERT_TRUE(
+ ConstantFoldBinaryInstruction(Instruction::And, PtrToAddr, TheConstant)
+ ->isNullValue());
}
// Check that containsUndefOrPoisonElement and containsPoisonElement is working
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index fe9e7e8..f4693bf 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -606,12 +606,14 @@ TEST(InstructionTest, ConstrainedTrans) {
TEST(InstructionsTest, isEliminableCastPair) {
LLVMContext C;
- DataLayout DL1("p1:32:32");
+ DataLayout DL1("p1:32:32-p2:64:64:64:32");
Type *Int16Ty = Type::getInt16Ty(C);
+ Type *Int32Ty = Type::getInt32Ty(C);
Type *Int64Ty = Type::getInt64Ty(C);
Type *PtrTy64 = PointerType::get(C, 0);
Type *PtrTy32 = PointerType::get(C, 1);
+ Type *PtrTy64_32 = PointerType::get(C, 2);
// Source and destination pointers have same size -> bitcast.
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
@@ -637,6 +639,42 @@ TEST(InstructionsTest, isEliminableCastPair) {
Int64Ty, &DL1),
0U);
+ // Destination larger than source. Pointer type same as destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy64,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination larger than source. Pointer type different from destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy32,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination smaller than source. Pointer type same as source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy64,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // Destination smaller than source. Pointer type different from source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy32,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int64Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Non-truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int32Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::BitCast);
+
// Test that we don't eliminate bitcasts between different address spaces,
// or if we don't have available pointer size information.
DataLayout DL2("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
diff --git a/llvm/unittests/IR/RuntimeLibcallsTest.cpp b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
index 26cb7e3..8925d2b 100644
--- a/llvm/unittests/IR/RuntimeLibcallsTest.cpp
+++ b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
@@ -44,9 +44,9 @@ TEST(RuntimeLibcallsTest, LibcallImplByName) {
RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("sqrtl");
ASSERT_EQ(size(SquirtleSquad), 3);
auto I = SquirtleSquad.begin();
- EXPECT_EQ(*I++, RTLIB::impl_sqrt_f128);
- EXPECT_EQ(*I++, RTLIB::impl_sqrt_f80);
- EXPECT_EQ(*I++, RTLIB::impl_sqrt_ppcf128);
+ EXPECT_EQ(*I++, RTLIB::impl_sqrtl_f128);
+ EXPECT_EQ(*I++, RTLIB::impl_sqrtl_f80);
+ EXPECT_EQ(*I++, RTLIB::impl_sqrtl_ppcf128);
}
// Last libcall
@@ -54,9 +54,9 @@ TEST(RuntimeLibcallsTest, LibcallImplByName) {
auto Truncs = RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("truncl");
ASSERT_EQ(size(Truncs), 3);
auto I = Truncs.begin();
- EXPECT_EQ(*I++, RTLIB::impl_trunc_f128);
- EXPECT_EQ(*I++, RTLIB::impl_trunc_f80);
- EXPECT_EQ(*I++, RTLIB::impl_trunc_ppcf128);
+ EXPECT_EQ(*I++, RTLIB::impl_truncl_f128);
+ EXPECT_EQ(*I++, RTLIB::impl_truncl_f80);
+ EXPECT_EQ(*I++, RTLIB::impl_truncl_ppcf128);
}
}
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index 17d9f50..d6a3ca5 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -531,7 +531,7 @@ Sections:
// Check that we can detect unsupported versions.
SmallString<128> UnsupportedVersionYamlString(CommonYamlString);
UnsupportedVersionYamlString += R"(
- - Version: 4
+ - Version: 5
BBRanges:
- BaseAddress: 0x11111
BBEntries:
@@ -543,7 +543,7 @@ Sections:
{
SCOPED_TRACE("unsupported version");
DoCheck(UnsupportedVersionYamlString,
- "unsupported SHT_LLVM_BB_ADDR_MAP version: 4");
+ "unsupported SHT_LLVM_BB_ADDR_MAP version: 5");
}
SmallString<128> ZeroBBRangesYamlString(CommonYamlString);
@@ -761,14 +761,14 @@ Sections:
BBAddrMap E1 = {
{{0x11111,
- {{1, 0x0, 0x3, {false, true, false, false, false}, {0x1, 0x2}}}}}};
+ {{1, 0x0, 0x3, {false, true, false, false, false}, {0x1, 0x2}, 0}}}}};
BBAddrMap E2 = {
- {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}},
- {0xFFFFF, {{15, 0xF0, 0xF1, {true, true, true, true, true}, {}}}}}};
+ {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}},
+ {0xFFFFF, {{15, 0xF0, 0xF1, {true, true, true, true, true}, {}, 0}}}}};
BBAddrMap E3 = {
- {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}, {}}}}}};
+ {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}, {}, 0}}}}};
BBAddrMap E4 = {
- {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}, {}}}}}};
+ {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}, {}, 0}}}}};
std::vector<BBAddrMap> Section0BBAddrMaps = {E4};
std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
@@ -988,6 +988,123 @@ Sections:
}
}
+// Test for the ELFObjectFile::readBBAddrMap API with BBHash.
+TEST(ELFObjectFileTest, ReadBBHash) {
+ StringRef CommonYamlString(R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+Sections:
+ - Name: .llvm_bb_addr_map_1
+ Type: SHT_LLVM_BB_ADDR_MAP
+ Link: 1
+ Entries:
+ - Version: 4
+ Feature: 0x60
+ BBRanges:
+ - BaseAddress: 0x11111
+ BBEntries:
+ - ID: 1
+ AddressOffset: 0x0
+ Size: 0x1
+ Metadata: 0x2
+ CallsiteEndOffsets: [ 0x1 , 0x1 ]
+ Hash: 0x1
+ - Name: .llvm_bb_addr_map_2
+ Type: SHT_LLVM_BB_ADDR_MAP
+ Link: 1
+ Entries:
+ - Version: 4
+ Feature: 0x48
+ BBRanges:
+ - BaseAddress: 0x22222
+ BBEntries:
+ - ID: 2
+ AddressOffset: 0x0
+ Size: 0x2
+ Metadata: 0x4
+ Hash: 0x2
+ - BaseAddress: 0xFFFFF
+ BBEntries:
+ - ID: 15
+ AddressOffset: 0xF0
+ Size: 0xF1
+ Metadata: 0x1F
+ Hash: 0xF
+ - Name: .llvm_bb_addr_map_3
+ Type: SHT_LLVM_BB_ADDR_MAP
+ Link: 2
+ Entries:
+ - Version: 4
+ Feature: 0x40
+ BBRanges:
+ - BaseAddress: 0x33333
+ BBEntries:
+ - ID: 0
+ AddressOffset: 0x0
+ Size: 0x3
+ Metadata: 0x6
+ Hash: 0x3
+ - Name: .llvm_bb_addr_map_4
+ Type: SHT_LLVM_BB_ADDR_MAP
+ # Link: 0 (by default, can be overridden)
+ Entries:
+ - Version: 4
+ Feature: 0x40
+ BBRanges:
+ - BaseAddress: 0x44444
+ BBEntries:
+ - ID: 0
+ AddressOffset: 0x0
+ Size: 0x4
+ Metadata: 0x18
+ Hash: 0x4
+)");
+
+ BBAddrMap E1 = {
+ {{0x11111,
+ {{1, 0x0, 0x3, {false, true, false, false, false}, {0x1, 0x2}, 0x1}}}}};
+ BBAddrMap E2 = {
+ {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0x2}}},
+ {0xFFFFF, {{15, 0xF0, 0xF1, {true, true, true, true, true}, {}, 0xF}}}}};
+ BBAddrMap E3 = {
+ {{0x33333, {{0, 0x0, 0x3, {false, true, true, false, false}, {}, 0x3}}}}};
+ BBAddrMap E4 = {
+ {{0x44444, {{0, 0x0, 0x4, {false, false, false, true, true}, {}, 0x4}}}}};
+
+ std::vector<BBAddrMap> Section0BBAddrMaps = {E4};
+ std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
+ std::vector<BBAddrMap> Section2BBAddrMaps = {E1, E2};
+ std::vector<BBAddrMap> AllBBAddrMaps = {E1, E2, E3, E4};
+
+ auto DoCheckSucceeds = [&](StringRef YamlString,
+ std::optional<unsigned> TextSectionIndex,
+ std::vector<BBAddrMap> ExpectedResult) {
+ SCOPED_TRACE("for TextSectionIndex: " +
+ (TextSectionIndex ? llvm::Twine(*TextSectionIndex) : "{}") +
+ " and object yaml:\n" + YamlString);
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr =
+ toBinary<ELF64LE>(Storage, YamlString);
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+
+ Expected<const typename ELF64LE::Shdr *> BBAddrMapSecOrErr =
+ ElfOrErr->getELFFile().getSection(1);
+ ASSERT_THAT_EXPECTED(BBAddrMapSecOrErr, Succeeded());
+ auto BBAddrMaps = ElfOrErr->readBBAddrMap(TextSectionIndex);
+ ASSERT_THAT_EXPECTED(BBAddrMaps, Succeeded());
+ EXPECT_EQ(*BBAddrMaps, ExpectedResult);
+ };
+
+ DoCheckSucceeds(CommonYamlString, /*TextSectionIndex=*/std::nullopt,
+ AllBBAddrMaps);
+ DoCheckSucceeds(CommonYamlString, /*TextSectionIndex=*/0, Section0BBAddrMaps);
+ DoCheckSucceeds(CommonYamlString, /*TextSectionIndex=*/2, Section1BBAddrMaps);
+ DoCheckSucceeds(CommonYamlString, /*TextSectionIndex=*/1, Section2BBAddrMaps);
+}
+
// Test for the ELFObjectFile::readBBAddrMap API with PGOAnalysisMap.
TEST(ELFObjectFileTest, ReadPGOAnalysisMap) {
StringRef CommonYamlString(R"(
@@ -1159,29 +1276,32 @@ Sections:
)");
BBAddrMap E1 = {
- {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}, {}}}}}};
- PGOAnalysisMap P1 = {892, {}, {true, false, false, false, false, false}};
+ {{0x11111, {{1, 0x0, 0x1, {false, true, false, false, false}, {}, 0}}}}};
+ PGOAnalysisMap P1 = {
+ 892, {}, {true, false, false, false, false, false, false}};
BBAddrMap E2 = {
- {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}}}};
+ {{0x22222, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}};
PGOAnalysisMap P2 = {{},
{{BlockFrequency(343), {}}},
- {false, true, false, false, false, false}};
- BBAddrMap E3 = {{{0x33333,
- {{0, 0x0, 0x3, {false, true, true, false, false}, {}},
- {1, 0x3, 0x3, {false, false, true, false, false}, {}},
- {2, 0x6, 0x3, {false, false, false, false, false}, {}}}}}};
+ {false, true, false, false, false, false, false}};
+ BBAddrMap E3 = {
+ {{0x33333,
+ {{0, 0x0, 0x3, {false, true, true, false, false}, {}, 0},
+ {1, 0x3, 0x3, {false, false, true, false, false}, {}, 0},
+ {2, 0x6, 0x3, {false, false, false, false, false}, {}, 0}}}}};
PGOAnalysisMap P3 = {{},
{{{},
{{1, BranchProbability::getRaw(0x1111'1111)},
{2, BranchProbability::getRaw(0xeeee'eeee)}}},
{{}, {{2, BranchProbability::getRaw(0xffff'ffff)}}},
{{}, {}}},
- {false, false, true, false, false, false}};
- BBAddrMap E4 = {{{0x44444,
- {{0, 0x0, 0x4, {false, false, false, true, true}, {}},
- {1, 0x4, 0x4, {false, false, false, false, false}, {}},
- {2, 0x8, 0x4, {false, false, false, false, false}, {}},
- {3, 0xc, 0x4, {false, false, false, false, false}, {}}}}}};
+ {false, false, true, false, false, false, false}};
+ BBAddrMap E4 = {
+ {{0x44444,
+ {{0, 0x0, 0x4, {false, false, false, true, true}, {}, 0},
+ {1, 0x4, 0x4, {false, false, false, false, false}, {}, 0},
+ {2, 0x8, 0x4, {false, false, false, false, false}, {}, 0},
+ {3, 0xc, 0x4, {false, false, false, false, false}, {}, 0}}}}};
PGOAnalysisMap P4 = {
1000,
{{BlockFrequency(1000),
@@ -1193,22 +1313,24 @@ Sections:
{3, BranchProbability::getRaw(0xeeee'eeee)}}},
{BlockFrequency(18), {{3, BranchProbability::getRaw(0xffff'ffff)}}},
{BlockFrequency(1000), {}}},
- {true, true, true, false, false, false}};
+ {true, true, true, false, false, false, false}};
BBAddrMap E5 = {
- {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}, {}}}}}};
- PGOAnalysisMap P5 = {{}, {}, {false, false, false, false, false, false}};
+ {{0x55555, {{2, 0x0, 0x2, {false, false, true, false, false}, {}, 0}}}}};
+ PGOAnalysisMap P5 = {
+ {}, {}, {false, false, false, false, false, false, false}};
BBAddrMap E6 = {
{{0x66666,
- {{0, 0x0, 0x6, {false, true, true, false, false}, {}},
- {1, 0x6, 0x6, {false, false, true, false, false}, {}}}},
- {0x666661, {{2, 0x0, 0x6, {false, false, false, false, false}, {}}}}}};
+ {{0, 0x0, 0x6, {false, true, true, false, false}, {}, 0},
+ {1, 0x6, 0x6, {false, false, true, false, false}, {}, 0}}},
+ {0x666661,
+ {{2, 0x0, 0x6, {false, false, false, false, false}, {}, 0}}}}};
PGOAnalysisMap P6 = {{},
{{{},
{{1, BranchProbability::getRaw(0x2222'2222)},
{2, BranchProbability::getRaw(0xcccc'cccc)}}},
{{}, {{2, BranchProbability::getRaw(0x8888'8888)}}},
{{}, {}}},
- {false, false, true, true, false, false}};
+ {false, false, true, true, false, false, false}};
std::vector<BBAddrMap> Section0BBAddrMaps = {E4, E5, E6};
std::vector<BBAddrMap> Section1BBAddrMaps = {E3};
diff --git a/llvm/unittests/Object/ELFTypesTest.cpp b/llvm/unittests/Object/ELFTypesTest.cpp
index f88931b5f..1765e15 100644
--- a/llvm/unittests/Object/ELFTypesTest.cpp
+++ b/llvm/unittests/Object/ELFTypesTest.cpp
@@ -101,21 +101,22 @@ static_assert(
"PGOAnalysisMap should use the same type for basic block ID as BBAddrMap");
TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
- const std::array<BBAddrMap::Features, 11> Decoded = {
- {{false, false, false, false, false, false},
- {true, false, false, false, false, false},
- {false, true, false, false, false, false},
- {false, false, true, false, false, false},
- {false, false, false, true, false, false},
- {true, true, false, false, false, false},
- {false, true, true, false, false, false},
- {false, true, true, true, false, false},
- {true, true, true, true, false, false},
- {false, false, false, false, true, false},
- {false, false, false, false, false, true}}};
- const std::array<uint8_t, 11> Encoded = {{0b0000, 0b0001, 0b0010, 0b0100,
- 0b1000, 0b0011, 0b0110, 0b1110,
- 0b1111, 0b1'0000, 0b10'0000}};
+ const std::array<BBAddrMap::Features, 12> Decoded = {
+ {{false, false, false, false, false, false, false},
+ {true, false, false, false, false, false, false},
+ {false, true, false, false, false, false, false},
+ {false, false, true, false, false, false, false},
+ {false, false, false, true, false, false, false},
+ {true, true, false, false, false, false, false},
+ {false, true, true, false, false, false, false},
+ {false, true, true, true, false, false, false},
+ {true, true, true, true, false, false, false},
+ {false, false, false, false, true, false, false},
+ {false, false, false, false, false, true, false},
+ {false, false, false, false, false, false, true}}};
+ const std::array<uint8_t, 12> Encoded = {
+ {0b0000, 0b0001, 0b0010, 0b0100, 0b1000, 0b0011, 0b0110, 0b1110, 0b1111,
+ 0b1'0000, 0b10'0000, 0b100'0000}};
for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded))
EXPECT_EQ(Feat.encode(), EncodedVal);
for (const auto &[Feat, EncodedVal] : llvm::zip(Decoded, Encoded)) {
@@ -128,9 +129,9 @@ TEST(ELFTypesTest, BBAddrMapFeaturesEncodingTest) {
TEST(ELFTypesTest, BBAddrMapFeaturesInvalidEncodingTest) {
const std::array<std::string, 2> Errors = {
- "invalid encoding for BBAddrMap::Features: 0x40",
+ "invalid encoding for BBAddrMap::Features: 0x80",
"invalid encoding for BBAddrMap::Features: 0xf0"};
- const std::array<uint8_t, 2> Values = {{0b100'0000, 0b1111'0000}};
+ const std::array<uint8_t, 2> Values = {{0b1000'0000, 0b1111'0000}};
for (const auto &[Val, Error] : llvm::zip(Values, Errors)) {
EXPECT_THAT_ERROR(BBAddrMap::Features::decode(Val).takeError(),
FailedWithMessage(Error));
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 25efa00..21f10eb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -44,6 +44,7 @@ add_llvm_unittest(SupportTests
ExtensibleRTTITest.cpp
FileCollectorTest.cpp
FileOutputBufferTest.cpp
+ Format.cpp
FormatVariadicTest.cpp
FSUniqueIDTest.cpp
GenericDomTreeTest.cpp
diff --git a/llvm/unittests/Support/Format.cpp b/llvm/unittests/Support/Format.cpp
new file mode 100644
index 0000000..c4e421f
--- /dev/null
+++ b/llvm/unittests/Support/Format.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Format.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+template <typename FormatTy>
+std::string printToString(unsigned MaxN, FormatTy &&Fmt) {
+ std::vector<char> Dst(MaxN + 2);
+ int N = Fmt.snprint(Dst.data(), Dst.size());
+ Dst.back() = 0;
+ return N < 0 ? "" : Dst.data();
+}
+
+template <typename Expected, typename Arg>
+constexpr bool checkDecayTypeEq(const Arg &arg) {
+ return std::is_same_v<detail::decay_if_c_char_array_t<Arg>, Expected>;
+}
+
+TEST(Format, DecayIfCCharArray) {
+ char Array[] = "Array";
+ const char ConstArray[] = "ConstArray";
+ char PtrBuf[] = "Ptr";
+ char *Ptr = PtrBuf;
+ const char *PtrToConst = "PtrToConst";
+
+ EXPECT_EQ(" Literal", printToString(20, format("%15s", "Literal")));
+ EXPECT_EQ(" Array", printToString(20, format("%15s", Array)));
+ EXPECT_EQ(" ConstArray", printToString(20, format("%15s", ConstArray)));
+ EXPECT_EQ(" Ptr", printToString(20, format("%15s", Ptr)));
+ EXPECT_EQ(" PtrToConst", printToString(20, format("%15s", PtrToConst)));
+
+ EXPECT_TRUE(checkDecayTypeEq<const char *>("Literal"));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(Array));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(ConstArray));
+ EXPECT_TRUE(checkDecayTypeEq<char *>(Ptr));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(PtrToConst));
+ EXPECT_TRUE(checkDecayTypeEq<char>(PtrToConst[0]));
+ EXPECT_TRUE(
+ checkDecayTypeEq<const char *>(static_cast<const char *>("Literal")));
+
+ wchar_t WCharArray[] = L"WCharArray";
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t[11]>(WCharArray));
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t>(WCharArray[0]));
+}
+
+} // namespace
diff --git a/llvm/unittests/Support/SourceMgrTest.cpp b/llvm/unittests/Support/SourceMgrTest.cpp
index 301b64f..c65f001 100644
--- a/llvm/unittests/Support/SourceMgrTest.cpp
+++ b/llvm/unittests/Support/SourceMgrTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "gtest/gtest.h"
@@ -506,3 +507,13 @@ TEST_F(SourceMgrTest, PrintWithoutLoc) {
Diag.print(nullptr, OS, false, false, false);
EXPECT_EQ("message\n", Output);
}
+
+TEST_F(SourceMgrTest, IncludeDirs) {
+ auto VFS = makeIntrusiveRefCnt<vfs::InMemoryFileSystem>();
+ VFS->addFile("include/file", 0, MemoryBuffer::getMemBuffer("contents"));
+ SM.setVirtualFileSystem(std::move(VFS));
+ SM.setIncludeDirs({"include"});
+ std::string ResolvedPath;
+ unsigned NumBuffers = SM.AddIncludeFile("file", SMLoc(), ResolvedPath);
+ EXPECT_EQ(NumBuffers, 1u);
+}
diff --git a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
index c74d157..809960d 100644
--- a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
@@ -177,6 +177,165 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_VASHR) {
EXPECT_EQ(DAG->ComputeNumSignBits(Fr2), 5u);
}
+TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SUB) {
+ SDLoc Loc;
+ auto IntVT = EVT::getIntegerVT(Context, 8);
+ auto N0 = DAG->getConstant(0x00, Loc, IntVT);
+ auto N1 = DAG->getConstant(0x01, Loc, IntVT);
+ auto N5 = DAG->getConstant(0x05, Loc, IntVT);
+ auto Nsign1 = DAG->getConstant(0x55, Loc, IntVT);
+ auto UnknownOp = DAG->getRegister(0, IntVT);
+ auto Mask = DAG->getConstant(0x1e, Loc, IntVT);
+ auto Nsign3 = DAG->getNode(ISD::AND, Loc, IntVT, Mask, UnknownOp);
+ // RHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpRhsEo = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign3, Nsign1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpRhsEo), 1u);
+
+ // Neg 0
+ // N0 = 00000000
+ auto OpNegZero = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N0);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegZero), 8u);
+
+ // Neg 1
+ // N0 = 00000000
+ // N1 = 00000001
+ auto OpNegOne = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegOne), 8u);
+
+ // Neg 5
+ // N0 = 00000000
+ // N5 = 00000101
+ auto OpNegFive = DAG->getNode(ISD::SUB, Loc, IntVT, N0, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegFive), 5u);
+
+ // Non negative
+ // N0 = 00000000
+ // Nsign3 = 000????0
+ auto OpNonNeg = DAG->getNode(ISD::SUB, Loc, IntVT, N0, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNonNeg), 3u);
+
+ // LHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpLhsEo = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign1, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpLhsEo), 1u);
+
+ // Nsign3 = 000????0
+ // N5 = 00000101
+ auto Op = DAG->getNode(ISD::SUB, Loc, IntVT, Nsign3, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(Op), 2u);
+}
+
+TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_ADD) {
+ SDLoc Loc;
+ auto IntVT = EVT::getIntegerVT(Context, 8);
+ auto Nneg1 = DAG->getConstant(0xFF, Loc, IntVT);
+ auto N0 = DAG->getConstant(0x00, Loc, IntVT);
+ auto N1 = DAG->getConstant(0x01, Loc, IntVT);
+ auto N5 = DAG->getConstant(0x05, Loc, IntVT);
+ auto N8 = DAG->getConstant(0x08, Loc, IntVT);
+ auto Nsign1 = DAG->getConstant(0x55, Loc, IntVT);
+ auto UnknownOp = DAG->getRegister(0, IntVT);
+ auto Mask = DAG->getConstant(0x1e, Loc, IntVT);
+ auto Nsign3 = DAG->getNode(ISD::AND, Loc, IntVT, Mask, UnknownOp);
+ // RHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpRhsEo = DAG->getNode(ISD::ADD, Loc, IntVT, Nsign3, Nsign1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpRhsEo), 1u);
+
+ // ADD 0 -1
+ // N0 = 00000000
+ // Nneg1 = 11111111
+ auto OpNegZero = DAG->getNode(ISD::ADD, Loc, IntVT, N0, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegZero), 8u);
+
+ // ADD 1 -1
+ // N1 = 00000001
+ // Nneg1 = 11111111
+ auto OpNegOne = DAG->getNode(ISD::ADD, Loc, IntVT, N1, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegOne), 8u);
+
+ // ADD 8 -1
+ // N8 = 00001000
+ // Nneg1 = 11111111
+ auto OpSeven = DAG->getNode(ISD::ADD, Loc, IntVT, N8, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpSeven), 5u);
+
+ // Non negative
+ // Nsign3 = 000????0
+ // Nneg1 = 11111111
+ auto OpNonNeg = DAG->getNode(ISD::ADD, Loc, IntVT, Nsign3, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNonNeg), 3u);
+
+ // LHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpLhsEo = DAG->getNode(ISD::ADD, Loc, IntVT, Nsign1, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpLhsEo), 1u);
+
+ // Nsign3 = 000????0
+ // N5 = 00000101
+ auto Op = DAG->getNode(ISD::ADD, Loc, IntVT, Nsign3, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(Op), 2u);
+}
+
+TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_ADDC) {
+ SDLoc Loc;
+ auto IntVT = EVT::getIntegerVT(Context, 8);
+ auto Nneg1 = DAG->getConstant(0xFF, Loc, IntVT);
+ auto N0 = DAG->getConstant(0x00, Loc, IntVT);
+ auto N1 = DAG->getConstant(0x01, Loc, IntVT);
+ auto N5 = DAG->getConstant(0x05, Loc, IntVT);
+ auto N8 = DAG->getConstant(0x08, Loc, IntVT);
+ auto Nsign1 = DAG->getConstant(0x55, Loc, IntVT);
+ auto UnknownOp = DAG->getRegister(0, IntVT);
+ auto Mask = DAG->getConstant(0x1e, Loc, IntVT);
+ auto Nsign3 = DAG->getNode(ISD::AND, Loc, IntVT, Mask, UnknownOp);
+ // RHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpRhsEo = DAG->getNode(ISD::ADDC, Loc, IntVT, Nsign3, Nsign1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpRhsEo), 1u);
+
+ // ADD 0 -1
+ // N0 = 00000000
+ // Nneg1 = 11111111
+ auto OpNegZero = DAG->getNode(ISD::ADDC, Loc, IntVT, N0, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegZero), 8u);
+
+ // ADD 1 -1
+ // N1 = 00000001
+ // Nneg1 = 11111111
+ auto OpNegOne = DAG->getNode(ISD::ADDC, Loc, IntVT, N1, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNegOne), 8u);
+
+ // ADD 8 -1
+ // N8 = 00001000
+ // Nneg1 = 11111111
+ auto OpSeven = DAG->getNode(ISD::ADDC, Loc, IntVT, N8, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpSeven), 4u);
+
+ // Non negative
+ // Nsign3 = 000????0
+ // Nneg1 = 11111111
+ auto OpNonNeg = DAG->getNode(ISD::ADDC, Loc, IntVT, Nsign3, Nneg1);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpNonNeg), 3u);
+
+ // LHS early out
+ // Nsign1 = 01010101
+ // Nsign3 = 000????0
+ auto OpLhsEo = DAG->getNode(ISD::ADDC, Loc, IntVT, Nsign1, Nsign3);
+ EXPECT_EQ(DAG->ComputeNumSignBits(OpLhsEo), 1u);
+
+ // Nsign3 = 000????0
+ // N5 = 00000101
+ auto Op = DAG->getNode(ISD::ADDC, Loc, IntVT, Nsign3, N5);
+ EXPECT_EQ(DAG->ComputeNumSignBits(Op), 2u);
+}
+
TEST_F(AArch64SelectionDAGTest, SimplifyDemandedVectorElts_EXTRACT_SUBVECTOR) {
TargetLowering TL(*TM);
diff --git a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
index 841f44c..716f5f2 100644
--- a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
+++ b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
@@ -308,3 +308,223 @@ TEST(SSAUpdaterBulk, TwoBBLoop) {
EXPECT_EQ(Phi->getIncomingValueForBlock(Entry), ConstantInt::get(I32Ty, 0));
EXPECT_EQ(Phi->getIncomingValueForBlock(Loop), I);
}
+
+TEST(SSAUpdaterBulk, SimplifyPHIs) {
+ const char *IR = R"(
+ define void @main(i32 %val, i1 %cond) {
+ entry:
+ br i1 %cond, label %left, label %right
+ left:
+ %add = add i32 %val, 1
+ br label %exit
+ right:
+ %sub = sub i32 %val, 1
+ br label %exit
+ exit:
+ %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+ %cmp = icmp slt i32 0, 42
+ ret void
+ }
+ )";
+
+ llvm::LLVMContext Context;
+ llvm::SMDiagnostic Err;
+ std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(IR, Err, Context);
+ ASSERT_NE(M, nullptr) << "Failed to parse IR: " << Err.getMessage();
+
+ Function *F = M->getFunction("main");
+ auto *Entry = &F->getEntryBlock();
+ auto *Left = Entry->getTerminator()->getSuccessor(0);
+ auto *Right = Entry->getTerminator()->getSuccessor(1);
+ auto *Exit = Left->getSingleSuccessor();
+ auto *Val = &*F->arg_begin();
+ auto *Phi = &Exit->front();
+ auto *Cmp = &*std::next(Exit->begin());
+ auto *Add = &Left->front();
+ auto *Sub = &Right->front();
+
+ SSAUpdaterBulk Updater;
+ Type *I32Ty = Type::getInt32Ty(Context);
+
+ // Use %val directly instead of creating a phi.
+ unsigned ValVar = Updater.AddVariable("Val", I32Ty);
+ Updater.AddAvailableValue(ValVar, Left, Val);
+ Updater.AddAvailableValue(ValVar, Right, Val);
+ Updater.AddUse(ValVar, &Cmp->getOperandUse(0));
+
+ // Use existing %phi for %add and %sub values.
+ unsigned AddSubVar = Updater.AddVariable("AddSub", I32Ty);
+ Updater.AddAvailableValue(AddSubVar, Left, Add);
+ Updater.AddAvailableValue(AddSubVar, Right, Sub);
+ Updater.AddUse(AddSubVar, &Cmp->getOperandUse(1));
+
+ auto ExitSizeBefore = Exit->size();
+ DominatorTree DT(*F);
+ Updater.RewriteAndOptimizeAllUses(DT);
+
+ // Output for Exit->dump():
+ // exit: ; preds = %right, %left
+ // %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+ // %cmp = icmp slt i32 %val, %phi
+ // ret void
+
+ ASSERT_EQ(Exit->size(), ExitSizeBefore);
+ ASSERT_EQ(&Exit->front(), Phi);
+ EXPECT_EQ(Val, Cmp->getOperand(0));
+ EXPECT_EQ(Phi, Cmp->getOperand(1));
+}
+
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN);
+
+// Helper to run EliminateNewDuplicatePHINodes on the block named "testbb".
+static void RunEliminateNewDuplicatePHINode(
+ const char *AsmText,
+ std::function<void(BasicBlock &,
+ bool(BasicBlock *BB, BasicBlock::phi_iterator))>
+ Check) {
+ LLVMContext C;
+
+ SMDiagnostic Err;
+ std::unique_ptr<Module> M = parseAssemblyString(AsmText, Err, C);
+ if (!M) {
+ Err.print("UtilsTests", errs());
+ return;
+ }
+
+ Function *F = M->getFunction("main");
+ auto BBIt = std::find_if(F->begin(), F->end(), [](const BasicBlock &Block) {
+ return Block.getName() == "testbb";
+ });
+ ASSERT_NE(BBIt, F->end());
+ Check(*BBIt, EliminateNewDuplicatePHINodes);
+}
+
+static BasicBlock::phi_iterator getPhiIt(BasicBlock &BB, unsigned Idx) {
+ return std::next(BB.phis().begin(), Idx);
+}
+
+static PHINode *getPhi(BasicBlock &BB, unsigned Idx) {
+ return &*getPhiIt(BB, Idx);
+}
+
+static int getNumPHIs(BasicBlock &BB) {
+ return std::distance(BB.phis().begin(), BB.phis().end());
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderExisting) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 1, %entry ]
+ %ep1 = phi i32 [ 1, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ]
+ // %ep1 = phi i32 [ 1, %entry ]
+ // %u = add i32 %ep0, %ep0
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP0);
+ (void)EP1; // Avoid "unused" warning.
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderNew) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 2, %entry ]
+ %ep1 = phi i32 [ 2, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> NP0 = getPhi(BB, 0);
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %np0 = phi i32 [ 1, %entry ]
+ // %ep0 = phi i32 [ 2, %entry ]
+ // %ep1 = phi i32 [ 2, %entry ]
+ // %u = add i32 %np0, %np0
+ EXPECT_EQ(getNumPHIs(BB), 3);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), NP0);
+ EXPECT_EQ(Add.getOperand(1), NP0);
+ (void)EP0;
+ (void)EP1; // Avoid "unused" warning.
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_NewRefExisting) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ // %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ // %u = add i32 %ep0, %ep1
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP1);
+ });
+}
+
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_ExistingRefNew) {
+ RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *ENDPN) {
+ AssertingVH<PHINode> EP0 = getPhi(BB, 2);
+ AssertingVH<PHINode> EP1 = getPhi(BB, 3);
+ EXPECT_TRUE(ENDPN(&BB, getPhiIt(BB, 2)));
+ // Expected:
+ // %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ // %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ // %u = add i32 %ep0, %ep1
+ EXPECT_EQ(getNumPHIs(BB), 2);
+ Instruction &Add = *BB.getFirstNonPHIIt();
+ EXPECT_EQ(Add.getOperand(0), EP0);
+ EXPECT_EQ(Add.getOperand(1), EP1);
+ });
+}
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 55b68f5..2a0f500 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -45,8 +45,7 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
EXPECT_TRUE(VPDT.dominates(VPBB1, VPBB4));
EXPECT_FALSE(VPDT.dominates(VPBB4, VPBB1));
@@ -118,8 +117,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, R2);
VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, R1, {R1BB1});
checkDomChildren(VPDT, R1BB1, {R1BB2, R1BB4, R1BB3});
@@ -197,8 +195,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, VPBB2);
VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, VPBB1, {R1});
checkDomChildren(VPDT, R1, {R1BB1});
diff --git a/llvm/utils/Misc/zkill b/llvm/utils/Misc/zkill
index bc0bfd5..8e10144 100755
--- a/llvm/utils/Misc/zkill
+++ b/llvm/utils/Misc/zkill
@@ -14,7 +14,7 @@ def _write_message(kind, message):
file,line,_,_,_ = inspect.getframeinfo(f)
location = '%s:%d' % (os.path.basename(file), line)
- print >>sys.stderr, '%s: %s: %s' % (location, kind, message)
+ print('%s: %s: %s' % (location, kind, message), file=sys.stderr)
note = lambda message: _write_message('note', message)
warning = lambda message: _write_message('warning', message)
@@ -53,7 +53,7 @@ def extractExecutable(command):
class Struct:
def __init__(self, **kwargs):
- self.fields = kwargs.keys()
+ self.fields = list(kwargs.keys())
self.__dict__.update(kwargs)
def __repr__(self):
@@ -144,7 +144,7 @@ def main():
parser.add_option("-s", "", dest="signalName",
help="Name of the signal to use (default=%default)",
action="store", default='INT',
- choices=kSignals.keys())
+ choices=list(kSignals.keys()))
parser.add_option("-l", "", dest="listSignals",
help="List known signal names",
action="store_true", default=False)
@@ -202,18 +202,18 @@ def main():
(opts, args) = parser.parse_args()
if opts.listSignals:
- items = [(v,k) for k,v in kSignals.items()]
+ items = [(v,k) for k,v in list(kSignals.items())]
items.sort()
for i in range(0, len(items), 4):
- print '\t'.join(['%2d) SIG%s' % (k,v)
- for k,v in items[i:i+4]])
+ print('\t'.join(['%2d) SIG%s' % (k,v)
+ for k,v in items[i:i+4]]))
sys.exit(0)
# Figure out the signal to use.
signal = kSignals[opts.signalName]
signalValueName = str(signal)
if opts.verbose:
- name = dict((v,k) for k,v in kSignals.items()).get(signal,None)
+ name = dict((v,k) for k,v in list(kSignals.items())).get(signal,None)
if name:
signalValueName = name
note('using signal %d (SIG%s)' % (signal, name))
diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index b4d816e..3c6ff11 100644
--- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
@@ -266,10 +266,9 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
return;
StringRef Lang = DirLang.getName();
+ IncludeGuardEmitter IncGuard(OS, (Twine("LLVM_") + Lang + "_INC").str());
- OS << "#ifndef LLVM_" << Lang << "_INC\n";
- OS << "#define LLVM_" << Lang << "_INC\n";
- OS << "\n#include \"llvm/ADT/ArrayRef.h\"\n";
+ OS << "#include \"llvm/ADT/ArrayRef.h\"\n";
if (DirLang.hasEnableBitmaskEnumInNamespace())
OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n";
@@ -370,7 +369,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
OS << "};\n";
}
LlvmNS.close();
- OS << "#endif // LLVM_" << Lang << "_INC\n";
}
// Given a list of spellings (for a given clause/directive), order them
diff --git a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp
index df14c77..f795937 100644
--- a/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RISCVTargetDefEmitter.cpp
@@ -68,13 +68,14 @@ static void emitRISCVExtensions(const RecordKeeper &Records, raw_ostream &OS) {
if (!Extensions.empty()) {
OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
for (const Record *Ext : Extensions) {
- auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
+ std::vector<const Record *> ImpliesList =
+ Ext->getValueAsListOfDefs("Implies");
if (ImpliesList.empty())
continue;
StringRef Name = getExtensionName(Ext);
- for (auto *ImpliedExt : ImpliesList) {
+ for (const Record *ImpliedExt : ImpliesList) {
if (!ImpliedExt->isSubClassOf("RISCVExtension"))
continue;
@@ -150,11 +151,12 @@ static void emitRISCVProfiles(const RecordKeeper &Records, raw_ostream &OS) {
OS << "#ifdef GET_SUPPORTED_PROFILES\n";
OS << "#undef GET_SUPPORTED_PROFILES\n\n";
- auto Profiles = Records.getAllDerivedDefinitionsIfDefined("RISCVProfile");
+ ArrayRef<const Record *> Profiles =
+ Records.getAllDerivedDefinitionsIfDefined("RISCVProfile");
if (!Profiles.empty()) {
printProfileTable(OS, Profiles, /*Experimental=*/false);
- bool HasExperimentalProfiles = any_of(Profiles, [&](auto &Rec) {
+ bool HasExperimentalProfiles = any_of(Profiles, [&](const Record *Rec) {
return Rec->getValueAsBit("Experimental");
});
if (HasExperimentalProfiles)
@@ -173,15 +175,17 @@ static void emitRISCVProcs(const RecordKeeper &RK, raw_ostream &OS) {
// Iterate on all definition records.
for (const Record *Rec :
RK.getAllDerivedDefinitionsIfDefined("RISCVProcessorModel")) {
- const std::vector<const Record *> &Features =
+ std::vector<const Record *> Features =
Rec->getValueAsListOfDefs("Features");
- bool FastScalarUnalignedAccess = any_of(Features, [&](auto &Feature) {
- return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
- });
-
- bool FastVectorUnalignedAccess = any_of(Features, [&](auto &Feature) {
- return Feature->getValueAsString("Name") == "unaligned-vector-mem";
- });
+ bool FastScalarUnalignedAccess =
+ any_of(Features, [&](const Record *Feature) {
+ return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
+ });
+
+ bool FastVectorUnalignedAccess =
+ any_of(Features, [&](const Record *Feature) {
+ return Feature->getValueAsString("Name") == "unaligned-vector-mem";
+ });
OS << "PROC(" << Rec->getName() << ", {\"" << Rec->getValueAsString("Name")
<< "\"}, {\"";
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index fd8ddb1..3938d39 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -592,7 +592,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
DenseMap<PredicateWithCC, LibcallsWithCC> Pred2Funcs;
SmallVector<uint64_t, 32> BitsetValues(
- divideCeil(RuntimeLibcallImplDefList.size(), BitsPerStorageElt));
+ divideCeil(RuntimeLibcallImplDefList.size() + 1, BitsPerStorageElt));
for (const Record *Elt : *Elements) {
const RuntimeLibcallImpl *LibCallImpl = getRuntimeLibcallImpl(Elt);
diff --git a/llvm/utils/TableGen/Common/Types.cpp b/llvm/utils/TableGen/Common/Types.cpp
index 35b79b3..8e8d6f6 100644
--- a/llvm/utils/TableGen/Common/Types.cpp
+++ b/llvm/utils/TableGen/Common/Types.cpp
@@ -8,16 +8,12 @@
#include "Types.h"
-// For LLVM_ATTRIBUTE_UNUSED
-#include "llvm/Support/Compiler.h"
-
#include <cassert>
using namespace llvm;
-const char *
-llvm::getMinimalTypeForRange(uint64_t Range,
- unsigned MaxSize LLVM_ATTRIBUTE_UNUSED) {
+const char *llvm::getMinimalTypeForRange(uint64_t Range,
+ [[maybe_unused]] unsigned MaxSize) {
// TODO: The original callers only used 32 and 64 so these are the only
// values permitted. Rather than widen the supported values we should
// allow 64 for the callers that currently use 32 and remove the
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index dba8bde..e0be104 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -555,7 +555,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) {
raw_string_ostream SuffixOS(ManglingSuffix);
Operands.PrintManglingSuffix(SuffixOS, ImmediatePredicates, true);
if (!StringSwitch<bool>(ManglingSuffix)
- .Cases("", "r", "rr", "ri", "i", "f", true)
+ .Cases({"", "r", "rr", "ri", "i", "f"}, true)
.Default(false))
continue;
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index ed7a4fe..3414190 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -99,6 +99,7 @@ static inline bool inheritsFrom(InstructionContext child,
(noPrefix && inheritsFrom(child, IC_XS, noPrefix)));
case IC_64BIT:
return (inheritsFrom(child, IC_64BIT_REXW) ||
+ inheritsFrom(child, IC_64BIT_REX2) ||
(noPrefix && inheritsFrom(child, IC_64BIT_OPSIZE, noPrefix)) ||
(!AdSize64 && inheritsFrom(child, IC_64BIT_ADSIZE)) ||
(noPrefix && inheritsFrom(child, IC_64BIT_XD, noPrefix)) ||
@@ -151,8 +152,10 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_64BIT_REXW_XS:
case IC_64BIT_REXW_OPSIZE:
case IC_64BIT_REXW_ADSIZE:
- case IC_64BIT_REX2:
+ case IC_64BIT_REX2_REXW:
return false;
+ case IC_64BIT_REX2:
+ return inheritsFrom(child, IC_64BIT_REX2_REXW);
case IC_VEX:
return (VEX_LIG && WIG && inheritsFrom(child, IC_VEX_L_W)) ||
(WIG && inheritsFrom(child, IC_VEX_W)) ||
@@ -980,9 +983,11 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
if ((index & ATTR_EVEXB) && (index & ATTR_EVEXU))
o << "_U";
}
- } else if ((index & ATTR_64BIT) && (index & ATTR_REX2))
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REX2)) {
o << "IC_64BIT_REX2";
- else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
+ if (index & ATTR_REXW)
+ o << "_REXW";
+ } else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
o << "IC_64BIT_REXW_XS";
else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD))
o << "IC_64BIT_REXW_XD";
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index e87a1c9..a006888 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -365,6 +365,8 @@ InstructionContext RecognizableInstr::insnContext() const {
insnContext = IC_64BIT_XD;
else if (OpPrefix == X86Local::XS)
insnContext = IC_64BIT_XS;
+ else if (HasREX_W && ExplicitREX2Prefix)
+ insnContext = IC_64BIT_REX2_REXW;
else if (ExplicitREX2Prefix)
insnContext = IC_64BIT_REX2;
else if (HasREX_W)
diff --git a/llvm/utils/clang-parse-diagnostics-file b/llvm/utils/clang-parse-diagnostics-file
index 1f720c3..fac5866 100755
--- a/llvm/utils/clang-parse-diagnostics-file
+++ b/llvm/utils/clang-parse-diagnostics-file
@@ -87,14 +87,14 @@ Utility for dumping Clang-style logged diagnostics.\
return
# Otherwise, print out the diagnostics.
- print
- print "**** BUILD DIAGNOSTICS ****"
+ print()
+ print("**** BUILD DIAGNOSTICS ****")
for file,selected_diags in to_report:
- print "*** %s ***" % file
+ print(("*** %s ***" % file))
for d in selected_diags:
- print " %s:%s:%s: %s: %s" % (
+ print((" %s:%s:%s: %s: %s" % (
d.get('filename'), d.get('line'), d.get('column'),
- d.get('level'), d.get('message'))
+ d.get('level'), d.get('message'))))
if __name__ == "__main__":
main()
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 6f809c5..406a728 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -205,9 +205,10 @@ class ClangFormatHelper(FormatHelper):
@property
def instructions(self) -> str:
- # TODO(boomanaiden154): Add --diff_from_common_commit option when it has
- # landed as in available in a released version.
- return " ".join(self._construct_command(["origin/main", "HEAD"]))
+ return (
+ " ".join(self._construct_command(["origin/main", "HEAD"]))
+ + " --diff_from_common_commit"
+ )
def should_include_extensionless_file(self, path: str) -> bool:
return path.startswith("libcxx/include")
@@ -390,7 +391,7 @@ You can test this locally with the following command:
return None
# Use git to find files that have had a change in the number of undefs
- regex = "([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)"
+ regex = "([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)"
cmd = ["git", "diff", "-U0", "--pickaxe-regex", "-S", regex]
if args.start_rev and args.end_rev:
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
index 5f9eb9a..fe212d1 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
@@ -5,6 +5,7 @@ static_library("Analysis") {
"//clang/include/clang/AST:StmtDataCollectors",
"//clang/lib/AST",
"//clang/lib/ASTMatchers",
+ "//clang/lib/Analysis/LifetimeSafety",
"//clang/lib/Basic",
"//clang/lib/Lex",
"//llvm/lib/Support",
@@ -27,8 +28,6 @@ static_library("Analysis") {
"FixitUtil.cpp",
"IntervalPartition.cpp",
"IssueHash.cpp",
- "LifetimeAnnotations.cpp",
- "LifetimeSafety.cpp",
"LiveVariables.cpp",
"MacroExpansionContext.cpp",
"ObjCNoReturn.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
new file mode 100644
index 0000000..7f962c4
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
@@ -0,0 +1,20 @@
+static_library("LifetimeSafety") {
+ output_name = "clangAnalysisLifetimeSafety"
+ configs += [ "//llvm/utils/gn/build:clang_code" ]
+ deps = [
+ "//clang/lib/AST",
+ "//clang/lib/Basic",
+ "//llvm/lib/Support",
+ ]
+ sources = [
+ "Checker.cpp",
+ "Facts.cpp",
+ "FactsGenerator.cpp",
+ "LifetimeAnnotations.cpp",
+ "LifetimeSafety.cpp",
+ "LiveOrigins.cpp",
+ "LoanPropagation.cpp",
+ "Loans.cpp",
+ "Origins.cpp",
+ ]
+}
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index 1afd342..c9f3a074 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -31,6 +31,7 @@ unittest("ClangAnalysisFlowSensitiveTests") {
"LoggerTest.cpp",
"MapLatticeTest.cpp",
"MatchSwitchTest.cpp",
+ "MockHeaders.cpp",
"MultiVarConstantPropagationTest.cpp",
"RecordOpsTest.cpp",
"SignAnalysisTest.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Basic/BUILD.gn
index 1449dc7..954de88 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Basic/BUILD.gn
@@ -14,6 +14,7 @@ unittest("BasicTests") {
"DiagnosticTest.cpp",
"FileEntryTest.cpp",
"FileManagerTest.cpp",
+ "LangOptionsTest.cpp",
"LineOffsetMappingTest.cpp",
"OffloadArchTest.cpp",
"SanitizersTest.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
index 0b9282e..d5a25f9 100644
--- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
@@ -33,6 +33,7 @@ unittest("StaticAnalysisTests") {
"StoreTest.cpp",
"SymbolReaperTest.cpp",
"TestReturnValueUnderConstruction.cpp",
+ "UnsignedStatDemo.cpp",
"Z3CrosscheckOracleTest.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn
index dabc578..585e0a4 100644
--- a/llvm/utils/gn/secondary/lld/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn
@@ -1,5 +1,6 @@
import("//llvm/lib/DebugInfo/PDB/enable_dia.gni")
import("//llvm/triples.gni")
+import("//llvm/utils/gn/build/libs/pthread/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
import("//llvm/utils/gn/build/libs/zstd/enable.gni")
@@ -88,6 +89,12 @@ write_lit_cfg("lit_site_cfg") {
extra_values += [ "LLVM_ENABLE_LIBXML2=0" ] # Must be 0.
}
+ if (llvm_enable_threads) {
+ extra_values += [ "LLVM_ENABLE_THREADS=1" ]
+ } else {
+ extra_values += [ "LLVM_ENABLE_THREADS=0" ] # Must be 0.
+ }
+
if (llvm_enable_zlib) {
extra_values += [ "LLVM_ENABLE_ZLIB=1" ]
} else {
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
index 9b69a44..8438421 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
@@ -68,6 +68,7 @@ static_library("Orc") {
"SectCreate.cpp",
"SelfExecutorProcessControl.cpp",
"SimpleRemoteEPC.cpp",
+ "SimpleRemoteMemoryMapper.cpp",
"SpeculateAnalyses.cpp",
"Speculation.cpp",
"TaskDispatch.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index a25f058..4553968 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -48,6 +48,7 @@ unittest("SupportTests") {
"FSUniqueIDTest.cpp",
"FileCollectorTest.cpp",
"FileOutputBufferTest.cpp",
+ "Format.cpp",
"FormatVariadicTest.cpp",
"GenericDomTreeTest.cpp",
"GlobPatternTest.cpp",
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index bdcb8a3..3f8be5e 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -906,7 +906,6 @@ Transforms/InstCombine/select_frexp.ll
Transforms/InstCombine/select.ll
Transforms/InstCombine/select-min-max.ll
Transforms/InstCombine/select-of-symmetric-selects.ll
-Transforms/InstCombine/select-safe-impliedcond-transforms.ll
Transforms/InstCombine/select-safe-transforms.ll
Transforms/InstCombine/select-select.ll
Transforms/InstCombine/select-with-extreme-eq-cond.ll
@@ -1129,6 +1128,7 @@ Transforms/LowerIFunc/ifunc-alias.ll
Transforms/LowerIFunc/ifunc-nonsense-resolvers.ll
Transforms/LowerIFunc/ifunc-program-addrspace.ll
Transforms/LowerIFunc/lower-ifunc.ll
+Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
Transforms/LowerMatrixIntrinsics/multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1236,7 +1236,6 @@ Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
Transforms/PGOProfile/chr-dead-pred.ll
Transforms/PGOProfile/chr-dup-threshold.ll
Transforms/PGOProfile/chr-lifetimes.ll
-Transforms/PGOProfile/chr.ll
Transforms/PGOProfile/chr-poison.ll
Transforms/PGOProfile/comdat.ll
Transforms/PGOProfile/memop_profile_funclet_wasm.ll
@@ -1311,93 +1310,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
-Transforms/SLPVectorizer/AArch64/gather-root.ll
-Transforms/SLPVectorizer/AArch64/horizontal.ll
-Transforms/SLPVectorizer/AArch64/loadi8.ll
-Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/AArch64/uselistorder.ll
-Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
-Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
-Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
-Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll
-Transforms/SLPVectorizer/const-bool-logical-or-reduction.ll
-Transforms/SLPVectorizer/extracts-with-undefs.ll
-Transforms/SLPVectorizer/freeze-signedness-missed.ll
-Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
-Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
-Transforms/SLPVectorizer/insert-element-build-vector-const.ll
-Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
-Transforms/SLPVectorizer/insert-element-build-vector.ll
-Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
-Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
-Transforms/SLPVectorizer/minbitwidth-user-not-min.ll
-Transforms/SLPVectorizer/partial-register-extract.ll
-Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
-Transforms/SLPVectorizer/reorder-node.ll
-Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll
-Transforms/SLPVectorizer/revec.ll
-Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
-Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
-Transforms/SLPVectorizer/RISCV/reordered-interleaved-loads.ll
-Transforms/SLPVectorizer/RISCV/revec.ll
-Transforms/SLPVectorizer/RISCV/select-profitability.ll
-Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
-Transforms/SLPVectorizer/RISCV/unsigned-node-trunc-with-signed-users.ll
-Transforms/SLPVectorizer/slp-deleted-inst.ll
-Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
-Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
-Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
-Transforms/SLPVectorizer/X86/bool-mask.ll
-Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
-Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
-Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll
-Transforms/SLPVectorizer/X86/cmp_sel.ll
-Transforms/SLPVectorizer/X86/crash_7zip.ll
-Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
-Transforms/SLPVectorizer/X86/crash_cmpop.ll
-Transforms/SLPVectorizer/X86/debug-counter.ll
-Transforms/SLPVectorizer/X86/debug-info-salvage.ll
-Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
-Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
-Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
-Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
-Transforms/SLPVectorizer/X86/horizontal-minmax.ll
-Transforms/SLPVectorizer/X86/insert-after-bundle.ll
-Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
-Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
-Transforms/SLPVectorizer/X86/minbw-user-non-sizable.ll
-Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
-Transforms/SLPVectorizer/X86/ordering-bug.ll
-Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll
-Transforms/SLPVectorizer/X86/pr46983.ll
-Transforms/SLPVectorizer/X86/pr49933.ll
-Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
-Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
-Transforms/SLPVectorizer/X86/reduction-logical.ll
-Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
-Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
-Transforms/SLPVectorizer/X86/select-reduction-op.ll
-Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
-Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
-Transforms/SLPVectorizer/X86/undef_vect.ll
-Transforms/SLPVectorizer/X86/used-reduced-op.ll
-Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
-Transforms/SLPVectorizer/X86/whole-registers-compare.ll
-Transforms/SROA/addrspacecast.ll
-Transforms/SROA/phi-and-select.ll
-Transforms/SROA/phi-gep.ll
-Transforms/SROA/scalable-vectors-with-known-vscale.ll
-Transforms/SROA/select-gep.ll
-Transforms/SROA/select-load.ll
-Transforms/SROA/slice-width.ll
-Transforms/SROA/std-clamp.ll
-Transforms/SROA/vector-conversion.ll
-Transforms/SROA/vector-promotion-cannot-tree-structure-merge.ll
-Transforms/SROA/vector-promotion.ll
Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
Transforms/StructurizeCFG/hoist-zerocost.ll
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 54645d0..001339f 100755..100644
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -156,16 +156,14 @@ set common_cmake_flags=^
-DLLVM_BUILD_LLVM_C_DYLIB=ON ^
-DPython3_FIND_REGISTRY=NEVER ^
-DPACKAGE_VERSION=%package_version% ^
- -DLLDB_RELOCATABLE_PYTHON=1 ^
- -DLLDB_EMBED_PYTHON_HOME=OFF ^
-DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
-DLLVM_ENABLE_LIBXML2=FORCE_ON ^
- -DLLDB_ENABLE_LIBXML2=OFF ^
-DCLANG_ENABLE_LIBXML2=OFF ^
-DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
-DLLVM_ENABLE_RPMALLOC=ON ^
- -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp"
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^
+ -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp"
if "%force-msvc%" == "" (
where /q clang-cl
@@ -185,6 +183,11 @@ if "%force-msvc%" == "" (
)
)
+set common_lldb_flags=^
+ -DLLDB_RELOCATABLE_PYTHON=1 ^
+ -DLLDB_EMBED_PYTHON_HOME=OFF ^
+ -DLLDB_ENABLE_LIBXML2=OFF
+
set cmake_profile_flags=""
REM Preserve original path
@@ -192,8 +195,8 @@ set OLDPATH=%PATH%
REM Build the 32-bits and/or 64-bits binaries.
if "%x86%" == "true" call :do_build_32 || exit /b 1
-if "%x64%" == "true" call :do_build_64 || exit /b 1
-if "%arm64%" == "true" call :do_build_arm64 || exit /b 1
+if "%x64%" == "true" call :do_build_64_common amd64 %python64_dir% || exit /b 1
+if "%arm64%" == "true" call :do_build_64_common arm64 %pythonarm64_dir% || exit /b 1
exit /b 0
::==============================================================================
@@ -212,8 +215,6 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
set cmake_flags=^
%common_cmake_flags% ^
-DLLVM_ENABLE_RPMALLOC=OFF ^
- -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
- -DPYTHON_HOME=%PYTHONHOME% ^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
-DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
@@ -231,6 +232,9 @@ REM CMake expects the paths that specifies the compiler and linker to be
REM with forward slash.
set all_cmake_flags=^
%cmake_flags% ^
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^
+ %common_lldb_flags% ^
+ -DPYTHON_HOME=%PYTHONHOME% ^
-DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
-DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
@@ -254,32 +258,42 @@ exit /b 0
::==============================================================================
::==============================================================================
-:: Build 64-bits binaries.
+:: Build 64-bits binaries (common function for both x64 and arm64)
::==============================================================================
-:do_build_64
-call :set_environment %python64_dir% || exit /b 1
-call "%vsdevcmd%" -arch=amd64 || exit /b 1
+:do_build_64_common
+set arch=%1
+set python_dir=%2
+
+call :set_environment %python_dir% || exit /b 1
+call "%vsdevcmd%" -arch=%arch% || exit /b 1
@echo on
-mkdir build64_stage0
-cd build64_stage0
+mkdir build_%arch%_stage0
+cd build_%arch%_stage0
call :do_build_libxml || exit /b 1
REM Stage0 binaries directory; used in stage1.
-set "stage0_bin_dir=%build_dir%/build64_stage0/bin"
+set "stage0_bin_dir=%build_dir%/build_%arch%_stage0/bin"
set cmake_flags=^
%common_cmake_flags% ^
- -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
- -DPYTHON_HOME=%PYTHONHOME% ^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
- -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+ -DCLANG_DEFAULT_LINKER=lld
+if "%arch%"=="arm64" (
+ set cmake_flags=%cmake_flags% ^
+ -DCOMPILER_RT_BUILD_SANITIZERS=OFF
+)
-cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+cmake -GNinja %cmake_flags% ^
+ -DLLVM_TARGETS_TO_BUILD=Native ^
+ %llvm_src%\llvm || exit /b 1
ninja || ninja || ninja || exit /b 1
ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
-ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+if "%arch%"=="amd64" (
+ ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1
+)
ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
cd..
@@ -293,24 +307,40 @@ set all_cmake_flags=^
-DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
-DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
-DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+if "%arch%"=="arm64" (
+ set all_cmake_flags=%all_cmake_flags% ^
+ -DCPACK_SYSTEM_NAME=woa64
+)
set cmake_flags=%all_cmake_flags:\=/%
-
-mkdir build64
-cd build64
+mkdir build_%arch%
+cd build_%arch%
call :do_generate_profile || exit /b 1
-cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
+cmake -GNinja %cmake_flags% ^
+ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;flang;mlir" ^
+ %common_lldb_flags% ^
+ -DPYTHON_HOME=%PYTHONHOME% ^
+ %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
ninja || ninja || ninja || exit /b 1
ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
-ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+if "%arch%"=="amd64" (
+ ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1
+)
ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+REM ninja check-flang || ninja check-flang || ninja check-flang || exit /b 1
+REM ninja check-mlir || ninja check-mlir || ninja check-mlir || exit /b 1
+REM ninja check-lldb || ninja check-lldb || ninja check-lldb || exit /b 1
ninja package || exit /b 1
:: generate tarball with install toolchain only off
-set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+if "%arch%"=="amd64" (
+ set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+) else (
+ set filename=clang+llvm-%version%-aarch64-pc-windows-msvc
+)
cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
-DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
ninja install || exit /b 1
@@ -320,75 +350,7 @@ cd ..
7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
exit /b 0
-::==============================================================================
-
-::==============================================================================
-:: Build arm64 binaries.
-::==============================================================================
-:do_build_arm64
-call :set_environment %pythonarm64_dir% || exit /b 1
-call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
-@echo on
-mkdir build_arm64_stage0
-cd build_arm64_stage0
-call :do_build_libxml || exit /b 1
-
-REM Stage0 binaries directory; used in stage1.
-set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
-set cmake_flags=^
- %common_cmake_flags% ^
- -DCLANG_DEFAULT_LINKER=lld ^
- -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
- -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
- -DPython3_ROOT_DIR=%PYTHONHOME% ^
- -DCOMPILER_RT_BUILD_PROFILE=OFF ^
- -DCOMPILER_RT_BUILD_SANITIZERS=OFF
-
-REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins).
-cmake -GNinja %cmake_flags% ^
- -DCMAKE_C_COMPILER=clang-cl.exe ^
- -DCMAKE_CXX_COMPILER=clang-cl.exe ^
- %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-cd..
-
-REM CMake expects the paths that specifies the compiler and linker to be
-REM with forward slash.
-REM CPACK_SYSTEM_NAME is set to have a correct name for installer generated.
-set all_cmake_flags=^
- %cmake_flags% ^
- -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
- -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
- -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
- -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
- -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^
- -DCPACK_SYSTEM_NAME=woa64
-set cmake_flags=%all_cmake_flags:\=/%
-mkdir build_arm64
-cd build_arm64
-cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
-ninja || exit /b 1
-REM Check but do not fail on errors.
-ninja check-lldb
-::ninja check-llvm || exit /b 1
-::ninja check-clang || exit /b 1
-::ninja check-lld || exit /b 1
-::ninja check-sanitizer || exit /b 1
-::ninja check-clang-tools || exit /b 1
-::ninja check-clangd || exit /b 1
-ninja package || exit /b 1
-cd ..
-
-exit /b 0
-::==============================================================================
-::
::==============================================================================
:: Set PATH and some environment variables.
::==============================================================================
diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index 9639aa0..4afb41d 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -21,11 +21,7 @@ from __future__ import print_function
import sys
import re
-
-try:
- from urllib.request import urlopen
-except ImportError:
- from urllib2 import urlopen
+from urllib.request import urlopen
# This variable will body of the mappings function
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index e048caa..cbff478d 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -220,7 +220,7 @@ syn keyword llvmError getresult begin end
syn match llvmNoName /[%@!]\d\+\>/
syn match llvmNumber /-\?\<\d\+\>/
syn match llvmFloat /-\?\<\d\+\.\d*\(e[+-]\d\+\)\?\>/
-syn match llvmFloat /\<0x[KLMHR]\?\x\+\>/
+syn match llvmFloat /\<\(u\|s\)\?0x[KLMHR]\?\x\+\>/
syn keyword llvmBoolean true false
syn keyword llvmConstant zeroinitializer undef null none poison vscale
syn match llvmComment /;.*$/